|
9 | 9 |
|
10 | 10 | from django.utils.deprecation import RemovedInDjango60Warning
|
11 | 11 | from django.utils.encoding import punycode
|
12 |
| -from django.utils.functional import Promise, keep_lazy, keep_lazy_text |
| 12 | +from django.utils.functional import Promise, cached_property, keep_lazy, keep_lazy_text |
13 | 13 | from django.utils.http import RFC3986_GENDELIMS, RFC3986_SUBDELIMS
|
14 | 14 | from django.utils.regex_helper import _lazy_re_compile
|
15 | 15 | from django.utils.safestring import SafeData, SafeString, mark_safe
|
@@ -255,6 +255,16 @@ def unquote_quote(segment):
|
255 | 255 | return urlunsplit((scheme, netloc, path, query, fragment))
|
256 | 256 |
|
257 | 257 |
|
class CountsDict(dict):
    """
    Dictionary that lazily counts occurrences of its keys in `word`.

    Looking up a key that is not stored yet computes `word.count(key)`,
    stores the result under that key, and returns it. Stored values are
    ordinary dict items, so callers may adjust them afterwards (for example
    decrementing a count) without `word` being consulted again for that key.
    """

    def __init__(self, *args, word, **kwargs):
        # Keyword arguments must be forwarded as keywords. Unpacking them
        # positionally would hand dict() the key names as extra positional
        # arguments, which raises as soon as any keyword is supplied.
        super().__init__(*args, **kwargs)
        self.word = word

    def __missing__(self, key):
        # Cache the count so later lookups (and in-place updates) use the
        # stored value rather than recounting.
        count = self[key] = self.word.count(key)
        return count
| 266 | + |
| 267 | + |
258 | 268 | class Urlizer:
|
259 | 269 | """
|
260 | 270 | Convert any URLs in text into clickable links.
|
@@ -360,40 +370,72 @@ def trim_url(self, x, *, limit):
|
360 | 370 | return x
|
361 | 371 | return "%s…" % x[: max(0, limit - 1)]
|
362 | 372 |
|
@cached_property
def wrapping_punctuation_openings(self):
    """
    Return the opening item of every `wrapping_punctuation` pair, joined
    into a single string.
    """
    # Building a dict from the pairs keeps each distinct opening once, in
    # first-seen order; iterating the dict yields those openings.
    pairs = dict(self.wrapping_punctuation)
    return "".join(pairs)
| 376 | + |
@cached_property
def trailing_punctuation_chars_no_semicolon(self):
    """
    Return `trailing_punctuation_chars` with every ";" removed.
    """
    pieces = self.trailing_punctuation_chars.split(";")
    return "".join(pieces)
| 380 | + |
@cached_property
def trailing_punctuation_chars_has_semicolon(self):
    """
    Return True if ";" is one of the `trailing_punctuation_chars`.
    """
    chars = self.trailing_punctuation_chars
    return ";" in chars
| 384 | + |
def trim_punctuation(self, word):
    """
    Trim trailing and wrapping punctuation from `word`. Return the items of
    the new state as a `(lead, middle, trail)` tuple, where `lead` is the
    opening wrapping punctuation stripped from the start, `trail` is the
    punctuation stripped from the end, and `middle` is what remains, so that
    `lead + middle + trail == word`.
    """
    # Strip all opening wrapping punctuation.
    middle = word.lstrip(self.wrapping_punctuation_openings)
    lead = word[: len(word) - len(middle)]
    trail = ""

    # Counts are taken lazily from the initial middle; the entry for a
    # closing is decremented whenever closings are trimmed so it keeps
    # describing the current middle.
    counts = CountsDict(word=middle)
    # Continue trimming until middle remains unchanged.
    trimmed_something = True
    while trimmed_something and middle:
        trimmed_something = False
        # Trim closing wrapping punctuation that has no matching opening.
        for opening, closing in self.wrapping_punctuation:
            unbalanced = counts[closing] - counts[opening]
            if unbalanced <= 0:
                continue
            rstripped = middle.rstrip(closing)
            # Never trim more than the closings actually sitting at the end
            # of middle, otherwise non-punctuation characters would be moved
            # into the trail and the counts would go out of sync.
            strip = min(unbalanced, len(middle) - len(rstripped))
            if strip:
                # Prepend: the trail may already hold punctuation trimmed by
                # an earlier pair or an earlier pass.
                trail = middle[-strip:] + trail
                middle = middle[:-strip]
                trimmed_something = True
                counts[closing] -= strip

        # Trim trailing punctuation other than ";", which needs the entity
        # check below.
        rstripped = middle.rstrip(self.trailing_punctuation_chars_no_semicolon)
        if rstripped != middle:
            trail = middle[len(rstripped) :] + trail
            middle = rstripped
            trimmed_something = True

        if self.trailing_punctuation_chars_has_semicolon and middle.endswith(";"):
            # Only strip if not part of an HTML entity.
            amp = middle.rfind("&")
            if amp == -1:
                can_strip = True
            else:
                potential_entity = middle[amp:]
                escaped = html.unescape(potential_entity)
                # Unchanged text is not an entity. If unescaping still
                # leaves a trailing ";", there is a semicolon beyond the
                # entity itself that can be trimmed.
                can_strip = (escaped == potential_entity) or escaped.endswith(";")

            if can_strip:
                rstripped = middle.rstrip(";")
                amount_stripped = len(middle) - len(rstripped)
                if amp > -1 and amount_stripped > 1:
                    # Leave a trailing semicolon as might be an entity.
                    trail = middle[len(rstripped) + 1 :] + trail
                    middle = rstripped + ";"
                else:
                    trail = middle[len(rstripped) :] + trail
                    middle = rstripped
                trimmed_something = True

    return lead, middle, trail
|
398 | 440 |
|
399 | 441 | @staticmethod
|
|
0 commit comments