md.py 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936
  1. from __future__ import annotations
  2. import sys
  3. from functools import lru_cache
  4. from logging import getLogger
# `typing.final` exists since Python 3.8; on older interpreters fall back to
# `typing_extensions`, and, when that is unavailable too, to a no-op
# decorator so the rest of the module keeps working.
if sys.version_info >= (3, 8):
    from typing import final
else:
    try:
        from typing_extensions import final
    except ImportError:

        def final(cls):  # type: ignore[misc,no-untyped-def]
            # No-op stand-in: returns the class unchanged.
            return cls
  13. from .constant import (
  14. COMMON_CJK_CHARACTERS,
  15. COMMON_SAFE_ASCII_CHARACTERS,
  16. TRACE,
  17. UNICODE_SECONDARY_RANGE_KEYWORD,
  18. _ACCENTUATED,
  19. _ARABIC,
  20. _ARABIC_ISOLATED_FORM,
  21. _CJK,
  22. _HANGUL,
  23. _HIRAGANA,
  24. _KATAKANA,
  25. _LATIN,
  26. _THAI,
  27. )
  28. from .utils import (
  29. _character_flags,
  30. is_emoticon,
  31. is_punctuation,
  32. is_separator,
  33. is_symbol,
  34. remove_accent,
  35. unicode_range,
  36. )
# Combined bitmask for CJK/Hangul/Katakana/Hiragana/Thai glyph detection.
# Testing `flags & _GLYPH_MASK` once replaces five separate flag checks in
# the per-character hot path (see CharInfo.update).
_GLYPH_MASK: int = _CJK | _HANGUL | _KATAKANA | _HIRAGANA | _THAI
  39. @final
  40. class CharInfo:
  41. """Pre-computed character properties shared across all detectors.
  42. Instantiated once and reused via :meth:`update` on every character
  43. in the hot loop so that redundant calls to str methods
  44. (``isalpha``, ``isupper``, …) and cached utility functions
  45. (``_character_flags``, ``is_punctuation``, …) are avoided when
  46. several plugins need the same information.
  47. """
  48. __slots__ = (
  49. "character",
  50. "printable",
  51. "alpha",
  52. "upper",
  53. "lower",
  54. "space",
  55. "digit",
  56. "is_ascii",
  57. "case_variable",
  58. "flags",
  59. "accentuated",
  60. "latin",
  61. "is_cjk",
  62. "is_arabic",
  63. "is_glyph",
  64. "punct",
  65. "sym",
  66. )
  67. def __init__(self) -> None:
  68. self.character: str = ""
  69. self.printable: bool = False
  70. self.alpha: bool = False
  71. self.upper: bool = False
  72. self.lower: bool = False
  73. self.space: bool = False
  74. self.digit: bool = False
  75. self.is_ascii: bool = False
  76. self.case_variable: bool = False
  77. self.flags: int = 0
  78. self.accentuated: bool = False
  79. self.latin: bool = False
  80. self.is_cjk: bool = False
  81. self.is_arabic: bool = False
  82. self.is_glyph: bool = False
  83. self.punct: bool = False
  84. self.sym: bool = False
  85. def update(self, character: str) -> None:
  86. """Update all properties for *character* (called once per character)."""
  87. self.character = character
  88. # ASCII fast-path: for characters with ord < 128, we can skip
  89. # _character_flags() entirely and derive most properties from ord.
  90. o: int = ord(character)
  91. if o < 128:
  92. self.is_ascii = True
  93. self.accentuated = False
  94. self.is_cjk = False
  95. self.is_arabic = False
  96. self.is_glyph = False
  97. # ASCII alpha: a-z (97-122) or A-Z (65-90)
  98. if 65 <= o <= 90:
  99. # Uppercase ASCII letter
  100. self.alpha = True
  101. self.upper = True
  102. self.lower = False
  103. self.space = False
  104. self.digit = False
  105. self.printable = True
  106. self.case_variable = True
  107. self.flags = _LATIN
  108. self.latin = True
  109. self.punct = False
  110. self.sym = False
  111. elif 97 <= o <= 122:
  112. # Lowercase ASCII letter
  113. self.alpha = True
  114. self.upper = False
  115. self.lower = True
  116. self.space = False
  117. self.digit = False
  118. self.printable = True
  119. self.case_variable = True
  120. self.flags = _LATIN
  121. self.latin = True
  122. self.punct = False
  123. self.sym = False
  124. elif 48 <= o <= 57:
  125. # ASCII digit 0-9
  126. self.alpha = False
  127. self.upper = False
  128. self.lower = False
  129. self.space = False
  130. self.digit = True
  131. self.printable = True
  132. self.case_variable = False
  133. self.flags = 0
  134. self.latin = False
  135. self.punct = False
  136. self.sym = False
  137. elif o == 32 or (9 <= o <= 13):
  138. # Space, tab, newline, etc.
  139. self.alpha = False
  140. self.upper = False
  141. self.lower = False
  142. self.space = True
  143. self.digit = False
  144. self.printable = o == 32
  145. self.case_variable = False
  146. self.flags = 0
  147. self.latin = False
  148. self.punct = False
  149. self.sym = False
  150. else:
  151. # Other ASCII (punctuation, symbols, control chars)
  152. self.printable = character.isprintable()
  153. self.alpha = False
  154. self.upper = False
  155. self.lower = False
  156. self.space = False
  157. self.digit = False
  158. self.case_variable = False
  159. self.flags = 0
  160. self.latin = False
  161. self.punct = is_punctuation(character) if self.printable else False
  162. self.sym = is_symbol(character) if self.printable else False
  163. else:
  164. # Non-ASCII path
  165. self.is_ascii = False
  166. self.printable = character.isprintable()
  167. self.alpha = character.isalpha()
  168. self.upper = character.isupper()
  169. self.lower = character.islower()
  170. self.space = character.isspace()
  171. self.digit = character.isdigit()
  172. self.case_variable = self.lower != self.upper
  173. # Flag-based classification (single unicodedata.name() call, lru-cached)
  174. flags: int
  175. if self.alpha:
  176. flags = _character_flags(character)
  177. else:
  178. flags = 0
  179. self.flags = flags
  180. self.accentuated = bool(flags & _ACCENTUATED)
  181. self.latin = bool(flags & _LATIN)
  182. self.is_cjk = bool(flags & _CJK)
  183. self.is_arabic = bool(flags & _ARABIC)
  184. self.is_glyph = bool(flags & _GLYPH_MASK)
  185. # Eagerly compute punct and sym (avoids property dispatch overhead
  186. # on 300K+ accesses in the hot loop).
  187. self.punct = is_punctuation(character) if self.printable else False
  188. self.sym = is_symbol(character) if self.printable else False
class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.

    All detectors MUST extend and implement given methods. A detector
    receives characters one by one via :meth:`feed_info` and reports its
    accumulated verdict through :attr:`ratio`.
    """

    __slots__ = ()

    def feed_info(self, character: str, info: CharInfo) -> None:
        """
        The main routine to be executed upon character.
        Insert the logic in which the text would be considered chaotic.
        """
        raise NotImplementedError  # Defensive: abstract, subclasses must override.

    def reset(self) -> None:  # Defensive: abstract, subclasses must override.
        """
        Permit to reset the plugin to the initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        """
        raise NotImplementedError  # Defensive: abstract, subclasses must override.
  213. @final
  214. class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
  215. __slots__ = (
  216. "_punctuation_count",
  217. "_symbol_count",
  218. "_character_count",
  219. "_last_printable_char",
  220. "_frenzy_symbol_in_word",
  221. )
  222. def __init__(self) -> None:
  223. self._punctuation_count: int = 0
  224. self._symbol_count: int = 0
  225. self._character_count: int = 0
  226. self._last_printable_char: str | None = None
  227. self._frenzy_symbol_in_word: bool = False
  228. def feed_info(self, character: str, info: CharInfo) -> None:
  229. """Optimized feed using pre-computed character info."""
  230. self._character_count += 1
  231. if (
  232. character != self._last_printable_char
  233. and character not in COMMON_SAFE_ASCII_CHARACTERS
  234. ):
  235. if info.punct:
  236. self._punctuation_count += 1
  237. elif not info.digit and info.sym and not is_emoticon(character):
  238. self._symbol_count += 2
  239. self._last_printable_char = character
  240. def reset(self) -> None: # Abstract
  241. self._punctuation_count = 0
  242. self._character_count = 0
  243. self._symbol_count = 0
  244. @property
  245. def ratio(self) -> float:
  246. if self._character_count == 0:
  247. return 0.0
  248. ratio_of_punctuation: float = (
  249. self._punctuation_count + self._symbol_count
  250. ) / self._character_count
  251. return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
  252. @final
  253. class TooManyAccentuatedPlugin(MessDetectorPlugin):
  254. __slots__ = ("_character_count", "_accentuated_count")
  255. def __init__(self) -> None:
  256. self._character_count: int = 0
  257. self._accentuated_count: int = 0
  258. def feed_info(self, character: str, info: CharInfo) -> None:
  259. """Optimized feed using pre-computed character info."""
  260. self._character_count += 1
  261. if info.accentuated:
  262. self._accentuated_count += 1
  263. def reset(self) -> None: # Abstract
  264. self._character_count = 0
  265. self._accentuated_count = 0
  266. @property
  267. def ratio(self) -> float:
  268. if self._character_count < 8:
  269. return 0.0
  270. ratio_of_accentuation: float = self._accentuated_count / self._character_count
  271. return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
  272. @final
  273. class UnprintablePlugin(MessDetectorPlugin):
  274. __slots__ = ("_unprintable_count", "_character_count")
  275. def __init__(self) -> None:
  276. self._unprintable_count: int = 0
  277. self._character_count: int = 0
  278. def feed_info(self, character: str, info: CharInfo) -> None:
  279. """Optimized feed using pre-computed character info."""
  280. if (
  281. not info.space
  282. and not info.printable
  283. and character != "\x1a"
  284. and character != "\ufeff"
  285. ):
  286. self._unprintable_count += 1
  287. self._character_count += 1
  288. def reset(self) -> None: # Abstract
  289. self._unprintable_count = 0
  290. @property
  291. def ratio(self) -> float:
  292. if self._character_count == 0: # Defensive:
  293. return 0.0
  294. return (self._unprintable_count * 8) / self._character_count
  295. @final
  296. class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
  297. __slots__ = (
  298. "_successive_count",
  299. "_character_count",
  300. "_last_latin_character",
  301. "_last_was_accentuated",
  302. )
  303. def __init__(self) -> None:
  304. self._successive_count: int = 0
  305. self._character_count: int = 0
  306. self._last_latin_character: str | None = None
  307. self._last_was_accentuated: bool = False
  308. def feed_info(self, character: str, info: CharInfo) -> None:
  309. """Optimized feed using pre-computed character info."""
  310. self._character_count += 1
  311. if (
  312. self._last_latin_character is not None
  313. and info.accentuated
  314. and self._last_was_accentuated
  315. ):
  316. if info.upper and self._last_latin_character.isupper():
  317. self._successive_count += 1
  318. if remove_accent(character) == remove_accent(self._last_latin_character):
  319. self._successive_count += 1
  320. self._last_latin_character = character
  321. self._last_was_accentuated = info.accentuated
  322. def reset(self) -> None: # Abstract
  323. self._successive_count = 0
  324. self._character_count = 0
  325. self._last_latin_character = None
  326. self._last_was_accentuated = False
  327. @property
  328. def ratio(self) -> float:
  329. if self._character_count == 0:
  330. return 0.0
  331. return (self._successive_count * 2) / self._character_count
  332. @final
  333. class SuspiciousRange(MessDetectorPlugin):
  334. __slots__ = (
  335. "_suspicious_successive_range_count",
  336. "_character_count",
  337. "_last_printable_seen",
  338. "_last_printable_range",
  339. )
  340. def __init__(self) -> None:
  341. self._suspicious_successive_range_count: int = 0
  342. self._character_count: int = 0
  343. self._last_printable_seen: str | None = None
  344. self._last_printable_range: str | None = None
  345. def feed_info(self, character: str, info: CharInfo) -> None:
  346. """Optimized feed using pre-computed character info."""
  347. self._character_count += 1
  348. if info.space or info.punct or character in COMMON_SAFE_ASCII_CHARACTERS:
  349. self._last_printable_seen = None
  350. self._last_printable_range = None
  351. return
  352. if self._last_printable_seen is None:
  353. self._last_printable_seen = character
  354. self._last_printable_range = unicode_range(character)
  355. return
  356. unicode_range_a: str | None = self._last_printable_range
  357. unicode_range_b: str | None = unicode_range(character)
  358. if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
  359. self._suspicious_successive_range_count += 1
  360. self._last_printable_seen = character
  361. self._last_printable_range = unicode_range_b
  362. def reset(self) -> None: # Abstract
  363. self._character_count = 0
  364. self._suspicious_successive_range_count = 0
  365. self._last_printable_seen = None
  366. self._last_printable_range = None
  367. @property
  368. def ratio(self) -> float:
  369. if self._character_count <= 13:
  370. return 0.0
  371. ratio_of_suspicious_range_usage: float = (
  372. self._suspicious_successive_range_count * 2
  373. ) / self._character_count
  374. return ratio_of_suspicious_range_usage
@final
class SuperWeirdWordPlugin(MessDetectorPlugin):
    """Detect words that look garbled or improbably foreign.

    Accumulates alphabetic characters into a word buffer; on each word
    boundary (space, punctuation or separator) the buffered word is judged:
    heavy accent density, an accentuated uppercase last letter in a
    mixed-case word, a single lone glyph-script character, or a very long
    run of watched "foreign" characters all mark the word as bad. The ratio
    is the share of characters belonging to bad words.
    """

    __slots__ = (
        "_word_count",
        "_bad_word_count",
        "_foreign_long_count",
        "_is_current_word_bad",
        "_foreign_long_watch",
        "_character_count",
        "_bad_character_count",
        "_buffer_length",
        "_buffer_last_char",
        "_buffer_last_char_accentuated",
        "_buffer_accent_count",
        "_buffer_glyph_count",
        "_buffer_upper_count",
    )

    def __init__(self) -> None:
        # Word-level tallies.
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0
        # Per-word state, cleared at each word boundary.
        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False
        # Character-level tallies (only characters inside words count).
        self._character_count: int = 0
        self._bad_character_count: int = 0
        # Current word buffer, tracked as scalar counters instead of a str
        # to avoid per-word string building in the hot loop.
        self._buffer_length: int = 0
        self._buffer_last_char: str | None = None
        self._buffer_last_char_accentuated: bool = False
        self._buffer_accent_count: int = 0
        self._buffer_glyph_count: int = 0
        self._buffer_upper_count: int = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Optimized feed using pre-computed character info."""
        if info.alpha:
            # Extend the current word buffer.
            self._buffer_length += 1
            self._buffer_last_char = character
            if info.upper:
                self._buffer_upper_count += 1
            self._buffer_last_char_accentuated = info.accentuated
            if info.accentuated:
                self._buffer_accent_count += 1
            # Start watching once a non-Latin or accentuated, non-glyph
            # character shows up in the word.
            if (
                not self._foreign_long_watch
                and (not info.latin or info.accentuated)
                and not info.is_glyph
            ):
                self._foreign_long_watch = True
            if info.is_glyph:
                self._buffer_glyph_count += 1
            return
        if not self._buffer_length:
            # Not inside a word; nothing buffered to judge.
            return
        if info.space or info.punct or is_separator(character):
            # Word boundary: judge the buffered word.
            self._word_count += 1
            buffer_length: int = self._buffer_length
            self._character_count += buffer_length
            if buffer_length >= 4:
                # Half or more accentuated characters is suspicious.
                if self._buffer_accent_count / buffer_length >= 0.5:
                    self._is_current_word_bad = True
                # Word ending with an accentuated uppercase letter while the
                # word is not fully uppercase.
                elif (
                    self._buffer_last_char_accentuated
                    and self._buffer_last_char.isupper()  # type: ignore[union-attr]
                    and self._buffer_upper_count != buffer_length
                ):
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
                # Exactly one glyph-script character embedded in the word.
                elif self._buffer_glyph_count == 1:
                    self._is_current_word_bad = True
                    self._foreign_long_count += 1
            if buffer_length >= 24 and self._foreign_long_watch:
                # Very long watched word; tolerate probable camelCase
                # identifiers (some uppercase, but at most 30%).
                probable_camel_cased: bool = (
                    self._buffer_upper_count > 0
                    and self._buffer_upper_count / buffer_length <= 0.3
                )
                if not probable_camel_cased:
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += buffer_length
                self._is_current_word_bad = False
            # Clear the per-word buffer/state for the next word.
            self._foreign_long_watch = False
            self._buffer_length = 0
            self._buffer_last_char = None
            self._buffer_last_char_accentuated = False
            self._buffer_accent_count = 0
            self._buffer_glyph_count = 0
            self._buffer_upper_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and not info.digit
            and info.sym
        ):
            # Unexpected symbol inside a word: the word is bad, but the
            # symbol still extends the buffer.
            self._is_current_word_bad = True
            self._buffer_length += 1
            self._buffer_last_char = character
            self._buffer_last_char_accentuated = False

    def reset(self) -> None:  # Abstract
        self._buffer_length = 0
        self._buffer_last_char = None
        self._buffer_last_char_accentuated = False
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0
        self._buffer_accent_count = 0
        self._buffer_glyph_count = 0
        self._buffer_upper_count = 0

    @property
    def ratio(self) -> float:
        # Requires a meaningful sample unless a long foreign word was seen.
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0
        return self._bad_character_count / self._character_count
  491. @final
  492. class CjkUncommonPlugin(MessDetectorPlugin):
  493. """
  494. Detect messy CJK text that probably means nothing.
  495. """
  496. __slots__ = ("_character_count", "_uncommon_count")
  497. def __init__(self) -> None:
  498. self._character_count: int = 0
  499. self._uncommon_count: int = 0
  500. def feed_info(self, character: str, info: CharInfo) -> None:
  501. """Optimized feed using pre-computed character info."""
  502. self._character_count += 1
  503. if character not in COMMON_CJK_CHARACTERS:
  504. self._uncommon_count += 1
  505. def reset(self) -> None: # Abstract
  506. self._character_count = 0
  507. self._uncommon_count = 0
  508. @property
  509. def ratio(self) -> float:
  510. if self._character_count < 8:
  511. return 0.0
  512. uncommon_form_usage: float = self._uncommon_count / self._character_count
  513. # we can be pretty sure it's garbage when uncommon characters are widely
  514. # used. otherwise it could just be traditional chinese for example.
  515. return uncommon_form_usage / 10 if uncommon_form_usage > 0.5 else 0.0
@final
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """Detect unnatural alternation of upper/lower case inside words.

    A character is "concerned" when it is alphabetic and case-variable;
    anything else ends the current chunk. Within a chunk, each case flip
    relative to the previous letter arms a buffer; a second consecutive flip
    scores 2. Scores are committed at chunk end only for short (<= 64 char),
    non-pure-ASCII chunks not terminated by a digit.
    """

    __slots__ = (
        "_buf",
        "_character_count_since_last_sep",
        "_successive_upper_lower_count",
        "_successive_upper_lower_count_final",
        "_character_count",
        "_last_alpha_seen",
        "_last_alpha_seen_upper",
        "_last_alpha_seen_lower",
        "_current_ascii_only",
    )

    def __init__(self) -> None:
        # Armed by one case flip; a second flip while armed scores.
        self._buf: bool = False
        self._character_count_since_last_sep: int = 0
        # Pending score for the current chunk / committed total.
        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0
        self._character_count: int = 0
        # Case of the previously seen concerned character.
        self._last_alpha_seen: str | None = None
        self._last_alpha_seen_upper: bool = False
        self._last_alpha_seen_lower: bool = False
        # Pure-ASCII chunks are exempt from scoring.
        self._current_ascii_only: bool = True

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Optimized feed using pre-computed character info."""
        is_concerned: bool = info.alpha and info.case_variable
        chunk_sep: bool = not is_concerned
        if chunk_sep and self._character_count_since_last_sep > 0:
            # Chunk boundary: commit the pending score only for short,
            # non-ASCII-only chunks not terminated by a digit.
            if (
                self._character_count_since_last_sep <= 64
                and not info.digit
                and not self._current_ascii_only
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )
            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True
            return
        if self._current_ascii_only and not info.is_ascii:
            self._current_ascii_only = False
        if self._last_alpha_seen is not None:
            if (info.upper and self._last_alpha_seen_lower) or (
                info.lower and self._last_alpha_seen_upper
            ):
                # Case flipped relative to the previous letter.
                if self._buf:
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False
        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character
        self._last_alpha_seen_upper = info.upper
        self._last_alpha_seen_lower = info.lower

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._last_alpha_seen_upper = False
        self._last_alpha_seen_lower = False
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        if self._character_count == 0:  # Defensive:
            return 0.0
        return self._successive_upper_lower_count_final / self._character_count
  592. @final
  593. class ArabicIsolatedFormPlugin(MessDetectorPlugin):
  594. __slots__ = ("_character_count", "_isolated_form_count")
  595. def __init__(self) -> None:
  596. self._character_count: int = 0
  597. self._isolated_form_count: int = 0
  598. def reset(self) -> None: # Abstract
  599. self._character_count = 0
  600. self._isolated_form_count = 0
  601. def feed_info(self, character: str, info: CharInfo) -> None:
  602. """Optimized feed using pre-computed character info."""
  603. self._character_count += 1
  604. if info.flags & _ARABIC_ISOLATED_FORM:
  605. self._isolated_form_count += 1
  606. @property
  607. def ratio(self) -> float:
  608. if self._character_count < 8:
  609. return 0.0
  610. isolated_form_usage: float = self._isolated_form_count / self._character_count
  611. return isolated_form_usage
  612. @lru_cache(maxsize=1024)
  613. def is_suspiciously_successive_range(
  614. unicode_range_a: str | None, unicode_range_b: str | None
  615. ) -> bool:
  616. """
  617. Determine if two Unicode range seen next to each other can be considered as suspicious.
  618. """
  619. if unicode_range_a is None or unicode_range_b is None:
  620. return True
  621. if unicode_range_a == unicode_range_b:
  622. return False
  623. if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
  624. return False
  625. if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
  626. return False
  627. # Latin characters can be accompanied with a combining diacritical mark
  628. # eg. Vietnamese.
  629. if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
  630. "Combining" in unicode_range_a or "Combining" in unicode_range_b
  631. ):
  632. return False
  633. keywords_range_a, keywords_range_b = (
  634. unicode_range_a.split(" "),
  635. unicode_range_b.split(" "),
  636. )
  637. for el in keywords_range_a:
  638. if el in UNICODE_SECONDARY_RANGE_KEYWORD:
  639. continue
  640. if el in keywords_range_b:
  641. return False
  642. # Japanese Exception
  643. range_a_jp_chars, range_b_jp_chars = (
  644. unicode_range_a
  645. in (
  646. "Hiragana",
  647. "Katakana",
  648. ),
  649. unicode_range_b in ("Hiragana", "Katakana"),
  650. )
  651. if (range_a_jp_chars or range_b_jp_chars) and (
  652. "CJK" in unicode_range_a or "CJK" in unicode_range_b
  653. ):
  654. return False
  655. if range_a_jp_chars and range_b_jp_chars:
  656. return False
  657. if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
  658. if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
  659. return False
  660. if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
  661. return False
  662. # Chinese/Japanese use dedicated range for punctuation and/or separators.
  663. if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
  664. unicode_range_a in ["Katakana", "Hiragana"]
  665. and unicode_range_b in ["Katakana", "Hiragana"]
  666. ):
  667. if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
  668. return False
  669. if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
  670. return False
  671. if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
  672. return False
  673. return True
@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.

    The result is the sum of all detector ratios, rounded to 3 decimals.
    Results are lru-cached keyed on the full argument tuple (including the
    whole *decoded_sequence* string).
    """
    seq_len: int = len(decoded_sequence)

    # Intermediary threshold checks happen every `step` characters; shorter
    # inputs are re-checked more often so hopeless content bails out sooner.
    if seq_len < 511:
        step: int = 32
    elif seq_len < 1024:
        step = 64
    else:
        step = 128

    # Create each detector as a named local variable (unrolled from the generic loop).
    # This eliminates per-character iteration over the detector list and
    # per-character eligible() virtual dispatch, while keeping every plugin class
    # intact and fully readable.
    d_sp: TooManySymbolOrPunctuationPlugin = TooManySymbolOrPunctuationPlugin()
    d_ta: TooManyAccentuatedPlugin = TooManyAccentuatedPlugin()
    d_up: UnprintablePlugin = UnprintablePlugin()
    d_sda: SuspiciousDuplicateAccentPlugin = SuspiciousDuplicateAccentPlugin()
    d_sr: SuspiciousRange = SuspiciousRange()
    d_sw: SuperWeirdWordPlugin = SuperWeirdWordPlugin()
    d_cu: CjkUncommonPlugin = CjkUncommonPlugin()
    d_au: ArchaicUpperLowerPlugin = ArchaicUpperLowerPlugin()
    d_ai: ArabicIsolatedFormPlugin = ArabicIsolatedFormPlugin()

    # Local references for feed_info methods called in the hot loop
    # (LOAD_FAST instead of repeated attribute lookups).
    d_sp_feed = d_sp.feed_info
    d_ta_feed = d_ta.feed_info
    d_up_feed = d_up.feed_info
    d_sda_feed = d_sda.feed_info
    d_sr_feed = d_sr.feed_info
    d_sw_feed = d_sw.feed_info
    d_cu_feed = d_cu.feed_info
    d_au_feed = d_au.feed_info
    d_ai_feed = d_ai.feed_info

    # Single reusable CharInfo object (avoids per-character allocation).
    info: CharInfo = CharInfo()
    info_update = info.update

    mean_mess_ratio: float
    for block_start in range(0, seq_len, step):
        for character in decoded_sequence[block_start : block_start + step]:
            # Pre-compute all character properties once (shared across all plugins).
            info_update(character)
            # Detectors with eligible() == always True
            d_up_feed(character, info)
            d_sw_feed(character, info)
            d_au_feed(character, info)
            # Detectors with eligible() == isprintable
            if info.printable:
                d_sp_feed(character, info)
                d_sr_feed(character, info)
            # Detectors with eligible() == isalpha
            if info.alpha:
                d_ta_feed(character, info)
                # SuspiciousDuplicateAccent: isalpha() and is_latin()
                if info.latin:
                    d_sda_feed(character, info)
            # CjkUncommon: is_cjk()
            if info.is_cjk:
                d_cu_feed(character, info)
            # ArabicIsolatedForm: is_arabic()
            if info.is_arabic:
                d_ai_feed(character, info)
        # One threshold check per block of `step` characters.
        mean_mess_ratio = (
            d_sp.ratio
            + d_ta.ratio
            + d_up.ratio
            + d_sda.ratio
            + d_sr.ratio
            + d_sw.ratio
            + d_cu.ratio
            + d_au.ratio
            + d_ai.ratio
        )
        if mean_mess_ratio >= maximum_threshold:
            break
    else:
        # Loop ran to completion without hitting the threshold:
        # flush last word buffer in SuperWeirdWordPlugin via trailing newline.
        info_update("\n")
        d_sw_feed("\n", info)
        d_au_feed("\n", info)
        d_up_feed("\n", info)
        mean_mess_ratio = (
            d_sp.ratio
            + d_ta.ratio
            + d_up.ratio
            + d_sda.ratio
            + d_sr.ratio
            + d_sw.ratio
            + d_cu.ratio
            + d_au.ratio
            + d_ai.ratio
        )

    if debug:  # Defensive: diagnostics only, off by default.
        logger = getLogger("charset_normalizer")
        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={step} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )
        if seq_len > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
        for dt in [d_sp, d_ta, d_up, d_sda, d_sr, d_sw, d_cu, d_au, d_ai]:
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)