| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936 |
- from __future__ import annotations
- import sys
- from functools import lru_cache
- from logging import getLogger
- if sys.version_info >= (3, 8):
- from typing import final
- else:
- try:
- from typing_extensions import final
- except ImportError:
- def final(cls): # type: ignore[misc,no-untyped-def]
- return cls
- from .constant import (
- COMMON_CJK_CHARACTERS,
- COMMON_SAFE_ASCII_CHARACTERS,
- TRACE,
- UNICODE_SECONDARY_RANGE_KEYWORD,
- _ACCENTUATED,
- _ARABIC,
- _ARABIC_ISOLATED_FORM,
- _CJK,
- _HANGUL,
- _HIRAGANA,
- _KATAKANA,
- _LATIN,
- _THAI,
- )
- from .utils import (
- _character_flags,
- is_emoticon,
- is_punctuation,
- is_separator,
- is_symbol,
- remove_accent,
- unicode_range,
- )
# Single bitmask OR-ing together the CJK, Hangul, Katakana, Hiragana and Thai
# script flags; CharInfo tests a character's flags against it to set is_glyph.
_GLYPH_MASK: int = (
    _THAI
    | _HIRAGANA
    | _KATAKANA
    | _HANGUL
    | _CJK
)
@final
class CharInfo:
    """Pre-computed character properties shared across all detectors.

    One instance is created and then refreshed through :meth:`update` for
    every character of the hot loop, so that the ``str`` predicates
    (``isalpha``, ``isupper``, …) and the cached helper functions
    (``_character_flags``, ``is_punctuation``, …) are evaluated at most once
    per character, no matter how many plugins read the result.

    The boolean attributes mirror the ``str`` method of the same meaning
    (``printable`` -> ``isprintable()``, ``space`` -> ``isspace()``, …);
    ``flags`` holds the bitmask returned by ``_character_flags`` (only
    computed for alphabetic characters, ``0`` otherwise) and the script
    attributes (``latin``, ``is_cjk``, …) are derived from it.
    """

    __slots__ = (
        "character",
        "printable",
        "alpha",
        "upper",
        "lower",
        "space",
        "digit",
        "is_ascii",
        "case_variable",
        "flags",
        "accentuated",
        "latin",
        "is_cjk",
        "is_arabic",
        "is_glyph",
        "punct",
        "sym",
    )

    def __init__(self) -> None:
        self.character: str = ""  # last character passed to update()
        self.printable: bool = False  # str.isprintable()
        self.alpha: bool = False  # str.isalpha()
        self.upper: bool = False  # str.isupper()
        self.lower: bool = False  # str.islower()
        self.space: bool = False  # str.isspace()
        self.digit: bool = False  # str.isdigit()
        self.is_ascii: bool = False  # code point < 128
        self.case_variable: bool = False  # exactly one of upper / lower
        self.flags: int = 0  # _character_flags() bitmask, 0 when not alpha
        self.accentuated: bool = False  # flags & _ACCENTUATED
        self.latin: bool = False  # flags & _LATIN
        self.is_cjk: bool = False  # flags & _CJK
        self.is_arabic: bool = False  # flags & _ARABIC
        self.is_glyph: bool = False  # flags & _GLYPH_MASK
        self.punct: bool = False  # is_punctuation(), printable chars only
        self.sym: bool = False  # is_symbol(), printable chars only

    def update(self, character: str) -> None:
        """Update all properties for *character* (called once per character).

        :param character: the single character to classify.

        Code points below 128 take a fast path that derives nearly every
        property from the ordinal alone; everything else goes through the
        ``str`` predicates and ``_character_flags``.
        """
        self.character = character
        o: int = ord(character)

        if o < 128:
            # ASCII fast-path: no accent and no CJK/Arabic/glyph script here.
            self.is_ascii = True
            self.accentuated = False
            self.is_cjk = False
            self.is_arabic = False
            self.is_glyph = False

            if 65 <= o <= 90 or 97 <= o <= 122:
                # ASCII letter: A-Z (65-90) or a-z (97-122).
                is_upper: bool = o <= 90
                self.alpha = True
                self.upper = is_upper
                self.lower = not is_upper
                self.space = False
                self.digit = False
                self.printable = True
                self.case_variable = True
                self.flags = _LATIN
                self.latin = True
                self.punct = False
                self.sym = False
                return

            # Every remaining ASCII code point is caseless and carries no
            # script flag.
            self.alpha = False
            self.upper = False
            self.lower = False
            self.case_variable = False
            self.flags = 0
            self.latin = False

            if 48 <= o <= 57:
                # ASCII digit 0-9
                self.space = False
                self.digit = True
                self.printable = True
                self.punct = False
                self.sym = False
            elif o == 32 or 9 <= o <= 13 or 28 <= o <= 31:
                # ASCII whitespace exactly as str.isspace() defines it:
                # SPACE, TAB..CR and the FS/GS/RS/US information separators
                # (28-31). The separators must be included so this path
                # agrees with the non-ASCII path, which uses str.isspace().
                self.space = True
                self.digit = False
                self.printable = o == 32  # only SPACE itself is printable
                self.punct = False
                self.sym = False
            else:
                # Other ASCII (punctuation, symbols, control chars)
                printable: bool = character.isprintable()
                self.printable = printable
                self.space = False
                self.digit = False
                # Unprintable characters are never punctuation nor symbols,
                # so the helper calls are skipped for them.
                self.punct = is_punctuation(character) if printable else False
                self.sym = is_symbol(character) if printable else False
            return

        # Non-ASCII path
        self.is_ascii = False
        self.printable = character.isprintable()
        self.alpha = character.isalpha()
        self.upper = character.isupper()
        self.lower = character.islower()
        self.space = character.isspace()
        self.digit = character.isdigit()
        self.case_variable = self.lower != self.upper

        # Flag-based classification; only alphabetic characters are looked up.
        flags: int = _character_flags(character) if self.alpha else 0

        self.flags = flags
        self.accentuated = bool(flags & _ACCENTUATED)
        self.latin = bool(flags & _LATIN)
        self.is_cjk = bool(flags & _CJK)
        self.is_arabic = bool(flags & _ARABIC)
        self.is_glyph = bool(flags & _GLYPH_MASK)

        # Eagerly compute punct and sym so plugins read plain attributes
        # instead of going through a property in the hot loop.
        self.punct = is_punctuation(character) if self.printable else False
        self.sym = is_symbol(character) if self.printable else False
class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.
    All detectors MUST extend it and implement the given methods
    (``feed_info``, ``reset`` and the ``ratio`` property); every default
    implementation here raises ``NotImplementedError``.
    """

    __slots__ = ()

    def feed_info(self, character: str, info: CharInfo) -> None:
        """
        The main routine to be executed upon each character.
        Insert the logic by which the text would be considered chaotic.

        :param character: the character being examined.
        :param info: pre-computed properties of that same character.
        """
        raise NotImplementedError  # Defensive:

    def reset(self) -> None:  # Defensive:
        """
        Reset the plugin to its initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Compute the chaos ratio based on what feed_info() has seen.
        Must NOT be lower than 0.; no restriction on values greater than 0.
        """
        raise NotImplementedError  # Defensive:
@final
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """Report a mess when punctuation and symbols weigh too much in the text.

    A punctuation sign counts for one and a non-digit, non-emoticon symbol
    counts for two. A character identical to the one fed just before it, or
    one listed in ``COMMON_SAFE_ASCII_CHARACTERS``, is never counted.
    """

    __slots__ = (
        "_punctuation_count",
        "_symbol_count",
        "_character_count",
        "_last_printable_char",
        "_frenzy_symbol_in_word",
    )

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0
        self._last_printable_char: str | None = None
        # Not read anywhere in this class; kept because the slot is declared.
        self._frenzy_symbol_in_word: bool = False

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Account for *character* using its pre-computed properties.

        :param character: the character being examined.
        :param info: pre-computed properties of that same character.
        """
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if info.punct:
                self._punctuation_count += 1
            elif not info.digit and info.sym and not is_emoticon(character):
                # Symbols weigh twice as much as punctuation.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # Abstract
        """Restore the state the plugin had right after construction."""
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # Also forget the previously fed character, otherwise the first
        # character after a reset could be skipped as a "repetition" of a
        # character belonging to the previous run.
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    @property
    def ratio(self) -> float:
        """Weighted punctuation/symbol share, or 0.0 when it is below 0.3."""
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
@final
class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """Report a mess when accentuated characters are over-represented."""

    __slots__ = ("_character_count", "_accentuated_count")

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Count *character*, and count it as accentuated when *info* says so."""
        if info.accentuated:
            self._accentuated_count += 1
        self._character_count += 1

    def reset(self) -> None:  # Abstract
        """Zero both counters."""
        self._accentuated_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """Share of accentuated characters once at least 8 characters were fed.

        Shares below 0.35 are reported as 0.0.
        """
        seen: int = self._character_count
        if seen < 8:
            return 0.0

        accentuated_share: float = self._accentuated_count / seen
        if accentuated_share >= 0.35:
            return accentuated_share
        return 0.0
@final
class UnprintablePlugin(MessDetectorPlugin):
    """Report a mess when unprintable, non-whitespace characters show up.

    ``"\\x1a"`` and ``"\\ufeff"`` are tolerated and never counted.
    """

    __slots__ = ("_unprintable_count", "_character_count")

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Count *character*, flagging it when it is neither space nor printable.

        :param character: the character being examined.
        :param info: pre-computed properties of that same character.
        """
        if (
            not info.space
            and not info.printable
            and character != "\x1a"
            and character != "\ufeff"
        ):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # Abstract
        """Restore the state the plugin had right after construction."""
        self._unprintable_count = 0
        # The total must be cleared as well, otherwise the ratio computed
        # after a reset is diluted by characters seen before it.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """Unprintable share, each unprintable character weighing 8 times."""
        if self._character_count == 0:  # Defensive:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count
@final
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """Report a mess when accentuated characters follow one another."""

    __slots__ = (
        "_successive_count",
        "_character_count",
        "_last_latin_character",
        "_last_was_accentuated",
    )

    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0
        self._last_latin_character: str | None = None
        self._last_was_accentuated: bool = False

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Compare *character* with the previously fed one.

        When both are accentuated, one point is added if both are upper
        case, and one more if both reduce to the same character once their
        accent is removed.
        """
        self._character_count += 1

        previous = self._last_latin_character
        is_accentuated: bool = info.accentuated

        if previous is not None and is_accentuated and self._last_was_accentuated:
            if info.upper and previous.isupper():
                self._successive_count += 1
            # Same base letter wearing two different accents.
            if remove_accent(character) == remove_accent(previous):
                self._successive_count += 1

        self._last_latin_character = character
        self._last_was_accentuated = is_accentuated

    def reset(self) -> None:  # Abstract
        """Forget the previous character and zero the counters."""
        self._last_latin_character = None
        self._last_was_accentuated = False
        self._character_count = 0
        self._successive_count = 0

    @property
    def ratio(self) -> float:
        """Twice the number of suspicious successions per fed character."""
        total: int = self._character_count
        if total == 0:
            return 0.0

        return (self._successive_count * 2) / total
@final
class SuspiciousRange(MessDetectorPlugin):
    """Report a mess when adjacent characters come from unrelated ranges."""

    __slots__ = (
        "_suspicious_successive_range_count",
        "_character_count",
        "_last_printable_seen",
        "_last_printable_range",
    )

    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: str | None = None
        self._last_printable_range: str | None = None

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Compare the Unicode range of *character* with the previous one.

        Whitespace, punctuation and characters listed in
        ``COMMON_SAFE_ASCII_CHARACTERS`` break the chain: the character
        following them has nothing to be compared against.
        """
        self._character_count += 1

        breaks_chain: bool = (
            info.space or info.punct or character in COMMON_SAFE_ASCII_CHARACTERS
        )
        if breaks_chain:
            self._last_printable_seen = None
            self._last_printable_range = None
            return

        has_predecessor: bool = self._last_printable_seen is not None
        current_range: str | None = unicode_range(character)

        if has_predecessor and is_suspiciously_successive_range(
            self._last_printable_range, current_range
        ):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character
        self._last_printable_range = current_range

    def reset(self) -> None:  # Abstract
        """Forget the previous character and zero the counters."""
        self._last_printable_range = None
        self._last_printable_seen = None
        self._suspicious_successive_range_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """Twice the suspicious pair count per character, 0.0 up to 13 characters."""
        total: int = self._character_count
        if total <= 13:
            return 0.0

        return (self._suspicious_successive_range_count * 2) / total
@final
class SuperWeirdWordPlugin(MessDetectorPlugin):
    """Report a mess based on the share of characters found in "bad" words.

    The plugin accumulates statistics about the word being read (its length,
    accent / glyph / upper-case counts, last character) and judges the word
    once a space, a punctuation sign or a separator closes it.
    """

    __slots__ = (
        "_word_count",
        "_bad_word_count",
        "_foreign_long_count",
        "_is_current_word_bad",
        "_foreign_long_watch",
        "_character_count",
        "_bad_character_count",
        "_buffer_length",
        "_buffer_last_char",
        "_buffer_last_char_accentuated",
        "_buffer_accent_count",
        "_buffer_glyph_count",
        "_buffer_upper_count",
    )

    def __init__(self) -> None:
        # Totals over every closed word.
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        # Verdict flags for the word currently being read.
        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        # Character totals over every closed word (all / bad words only).
        self._character_count: int = 0
        self._bad_character_count: int = 0

        # Running statistics of the word currently being read.
        self._buffer_length: int = 0
        self._buffer_last_char: str | None = None
        self._buffer_last_char_accentuated: bool = False
        self._buffer_accent_count: int = 0
        self._buffer_glyph_count: int = 0
        self._buffer_upper_count: int = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Extend the current word with *character*, or close and judge it.

        :param character: the character being examined.
        :param info: pre-computed properties of that same character.
        """
        if info.alpha:
            # Alphabetic character: it only extends the current word.
            self._buffer_length += 1
            self._buffer_last_char = character

            if info.upper:
                self._buffer_upper_count += 1

            self._buffer_last_char_accentuated = info.accentuated

            if info.accentuated:
                self._buffer_accent_count += 1
            # Start watching for an overly long word as soon as it holds a
            # character that is non-latin or accentuated, and not a glyph.
            if (
                not self._foreign_long_watch
                and (not info.latin or info.accentuated)
                and not info.is_glyph
            ):
                self._foreign_long_watch = True
            if info.is_glyph:
                self._buffer_glyph_count += 1
            return
        # Nothing buffered: a non-alphabetic character cannot close a word.
        if not self._buffer_length:
            return
        if info.space or info.punct or is_separator(character):
            # End of word: account for it, then judge it.
            self._word_count += 1
            buffer_length: int = self._buffer_length

            self._character_count += buffer_length

            if buffer_length >= 4:
                # At least half of the word is accentuated.
                if self._buffer_accent_count / buffer_length >= 0.5:
                    self._is_current_word_bad = True
                # The word ends with an upper-case accentuated letter while
                # not being entirely upper case.
                elif (
                    self._buffer_last_char_accentuated
                    and self._buffer_last_char.isupper()  # type: ignore[union-attr]
                    and self._buffer_upper_count != buffer_length
                ):
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
                # Exactly one glyph character inside the word.
                elif self._buffer_glyph_count == 1:
                    self._is_current_word_bad = True
                    self._foreign_long_count += 1
            if buffer_length >= 24 and self._foreign_long_watch:
                # A very long watched word is tolerated only when it has some
                # upper-case letters, but no more than 30% of them.
                probable_camel_cased: bool = (
                    self._buffer_upper_count > 0
                    and self._buffer_upper_count / buffer_length <= 0.3
                )

                if not probable_camel_cased:
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += buffer_length
                self._is_current_word_bad = False

            # Start over with an empty word.
            self._foreign_long_watch = False
            self._buffer_length = 0
            self._buffer_last_char = None
            self._buffer_last_char_accentuated = False
            self._buffer_accent_count = 0
            self._buffer_glyph_count = 0
            self._buffer_upper_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and not info.digit
            and info.sym
        ):
            # A symbol (other than the few tolerated above) in the middle of
            # a word marks it as bad; the symbol is counted in its length.
            self._is_current_word_bad = True
            self._buffer_length += 1
            self._buffer_last_char = character
            self._buffer_last_char_accentuated = False

    def reset(self) -> None:  # Abstract
        """Restore the state the plugin had right after construction."""
        self._buffer_length = 0
        self._buffer_last_char = None
        self._buffer_last_char_accentuated = False
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0
        self._buffer_accent_count = 0
        self._buffer_glyph_count = 0
        self._buffer_upper_count = 0

    @property
    def ratio(self) -> float:
        """Share of characters that belong to bad words.

        Returns 0.0 while at most 10 words were closed and none of them
        bumped the foreign-long counter.
        """
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count
@final
class CjkUncommonPlugin(MessDetectorPlugin):
    """
    Detect messy CJK text that probably means nothing.
    """

    __slots__ = ("_character_count", "_uncommon_count")

    def __init__(self) -> None:
        self._character_count: int = 0
        self._uncommon_count: int = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Count *character*, and count it as uncommon when it is not listed
        in ``COMMON_CJK_CHARACTERS``."""
        is_common: bool = character in COMMON_CJK_CHARACTERS
        self._character_count += 1
        if not is_common:
            self._uncommon_count += 1

    def reset(self) -> None:  # Abstract
        """Zero both counters."""
        self._uncommon_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """A tenth of the uncommon share, once at least 8 characters were fed.

        Only a share strictly above one half is reported: widespread use of
        uncommon characters is very likely garbage, whereas a smaller share
        could simply be traditional Chinese, for example.
        """
        seen: int = self._character_count
        if seen < 8:
            return 0.0

        uncommon_share: float = self._uncommon_count / seen
        if uncommon_share > 0.5:
            return uncommon_share / 10
        return 0.0
@final
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """Report a mess when upper and lower case letters keep alternating.

    Alternations are accumulated per chunk (a run of characters ended by a
    character that is not a cased letter) and only committed to the final
    count when the chunk qualifies, see :meth:`feed_info`.
    """

    __slots__ = (
        "_buf",
        "_character_count_since_last_sep",
        "_successive_upper_lower_count",
        "_successive_upper_lower_count_final",
        "_character_count",
        "_last_alpha_seen",
        "_last_alpha_seen_upper",
        "_last_alpha_seen_lower",
        "_current_ascii_only",
    )

    def __init__(self) -> None:
        # True when exactly one case switch is pending; a second consecutive
        # switch turns the pair into a counted alternation.
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        # Alternations of the current chunk / committed over closed chunks.
        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        # Previously fed character of the chunk and its cached case.
        self._last_alpha_seen: str | None = None
        self._last_alpha_seen_upper: bool = False
        self._last_alpha_seen_lower: bool = False

        # Stays True as long as the current chunk only holds ASCII.
        self._current_ascii_only: bool = True

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Track case alternation, committing the chunk count on a separator.

        :param character: the character being examined.
        :param info: pre-computed properties of that same character.
        """
        is_concerned: bool = info.alpha and info.case_variable
        chunk_sep: bool = not is_concerned

        if chunk_sep and self._character_count_since_last_sep > 0:
            # A separator closes a non-empty chunk. Its alternations are kept
            # only when the chunk is at most 64 characters long, the
            # separator is not a digit and the chunk held non-ASCII content.
            if (
                self._character_count_since_last_sep <= 64
                and not info.digit
                and not self._current_ascii_only
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        # NOTE: a separator met while the chunk is empty is not handled by
        # the branch above; it falls through and is recorded like any other
        # character of the chunk.
        if self._current_ascii_only and not info.is_ascii:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            if (info.upper and self._last_alpha_seen_lower) or (
                info.lower and self._last_alpha_seen_upper
            ):
                # Case switched relative to the previous character: every
                # second consecutive switch is worth two points.
                if self._buf:
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character
        self._last_alpha_seen_upper = info.upper
        self._last_alpha_seen_lower = info.lower

    def reset(self) -> None:  # Abstract
        """Restore the state the plugin had right after construction."""
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._last_alpha_seen_upper = False
        self._last_alpha_seen_lower = False
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        """Committed alternation points per fed character."""
        if self._character_count == 0:  # Defensive:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count
@final
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    """Report a mess proportional to the use of Arabic isolated forms."""

    __slots__ = ("_character_count", "_isolated_form_count")

    def __init__(self) -> None:
        self._character_count: int = 0
        self._isolated_form_count: int = 0

    def reset(self) -> None:  # Abstract
        """Zero both counters."""
        self._isolated_form_count = 0
        self._character_count = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Count *character*, and count it as an isolated form when its flags
        carry ``_ARABIC_ISOLATED_FORM``."""
        carries_isolated_flag = info.flags & _ARABIC_ISOLATED_FORM
        self._character_count += 1
        if carries_isolated_flag:
            self._isolated_form_count += 1

    @property
    def ratio(self) -> float:
        """Share of isolated forms, 0.0 below 8 fed characters."""
        seen: int = self._character_count
        if seen < 8:
            return 0.0

        return self._isolated_form_count / seen
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: str | None, unicode_range_b: str | None
) -> bool:
    """
    Tell whether two Unicode ranges seen next to each other look suspicious.

    An unknown range (``None``) is always suspicious. Otherwise the pair is
    suspicious unless one of the exemptions below applies.
    """
    if unicode_range_a is None or unicode_range_b is None:
        return True

    range_a: str = unicode_range_a
    range_b: str = unicode_range_b

    if range_a == range_b:
        return False

    def either_mentions(keyword: str) -> bool:
        """True when *keyword* appears in at least one of the two range names."""
        return keyword in range_a or keyword in range_b

    latin_in_a: bool = "Latin" in range_a
    latin_in_b: bool = "Latin" in range_b

    # Two Latin ranges go together.
    if latin_in_a and latin_in_b:
        return False

    # Emoticons may sit next to anything.
    if either_mentions("Emoticons"):
        return False

    # Latin characters can be accompanied with a combining diacritical mark
    # eg. Vietnamese.
    if (latin_in_a or latin_in_b) and either_mentions("Combining"):
        return False

    # Ranges sharing a significant (non-secondary) word in their name.
    words_of_b = range_b.split(" ")
    shares_primary_word: bool = any(
        word in words_of_b
        for word in range_a.split(" ")
        if word not in UNICODE_SECONDARY_RANGE_KEYWORD
    )
    if shares_primary_word:
        return False

    kana_ranges = ("Hiragana", "Katakana")
    a_is_kana: bool = range_a in kana_ranges
    b_is_kana: bool = range_b in kana_ranges
    involves_cjk: bool = either_mentions("CJK")
    involves_basic_latin: bool = "Basic Latin" in (range_a, range_b)

    # Japanese exception: kana next to CJK, or kana next to kana.
    if (a_is_kana or b_is_kana) and involves_cjk:
        return False
    if a_is_kana and b_is_kana:
        return False

    # Hangul next to CJK or Basic Latin.
    if either_mentions("Hangul") and (involves_cjk or involves_basic_latin):
        return False

    # Chinese/Japanese use dedicated range for punctuation and/or separators.
    # (The kana-next-to-kana pair already returned above, so only the CJK
    # side of that exemption can still apply here.)
    if involves_cjk and (
        either_mentions("Punctuation")
        or either_mentions("Forms")
        or involves_basic_latin
    ):
        return False

    return True
@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded bytes sequence.

    The sequence is fed block by block to every detector. After each block
    the detector ratios are summed, and the computation stops early as soon
    as that sum reaches *maximum_threshold*.

    :param decoded_sequence: the decoded text to evaluate.
    :param maximum_threshold: sum of ratios at which the analysis stops.
    :param debug: when true, log the outcome at TRACE level on the
        ``charset_normalizer`` logger.
    :return: the summed ratio, rounded to 3 decimals.
    """
    seq_len: int = len(decoded_sequence)

    # Number of characters fed between two intermediate ratio checks.
    step: int = 32 if seq_len < 511 else (64 if seq_len < 1024 else 128)

    # One named local per detector rather than a generic list: the hot loop
    # below calls each of them directly, under its own eligibility test.
    symbol_punct = TooManySymbolOrPunctuationPlugin()
    accentuated = TooManyAccentuatedPlugin()
    unprintable = UnprintablePlugin()
    duplicate_accent = SuspiciousDuplicateAccentPlugin()
    range_switch = SuspiciousRange()
    weird_word = SuperWeirdWordPlugin()
    cjk_uncommon = CjkUncommonPlugin()
    archaic_case = ArchaicUpperLowerPlugin()
    arabic_isolated = ArabicIsolatedFormPlugin()

    # Bound methods hoisted into locals for the hot loop.
    feed_symbol_punct = symbol_punct.feed_info
    feed_accentuated = accentuated.feed_info
    feed_unprintable = unprintable.feed_info
    feed_duplicate_accent = duplicate_accent.feed_info
    feed_range_switch = range_switch.feed_info
    feed_weird_word = weird_word.feed_info
    feed_cjk_uncommon = cjk_uncommon.feed_info
    feed_archaic_case = archaic_case.feed_info
    feed_arabic_isolated = arabic_isolated.feed_info

    # Single reusable CharInfo object (avoids per-character allocation).
    info: CharInfo = CharInfo()
    refresh_info = info.update

    def summed_ratio() -> float:
        """Add up the current ratio of every detector, in a fixed order."""
        return (
            symbol_punct.ratio
            + accentuated.ratio
            + unprintable.ratio
            + duplicate_accent.ratio
            + range_switch.ratio
            + weird_word.ratio
            + cjk_uncommon.ratio
            + archaic_case.ratio
            + arabic_isolated.ratio
        )

    mean_mess_ratio: float

    for block_start in range(0, seq_len, step):
        block: str = decoded_sequence[block_start : block_start + step]

        for character in block:
            # Pre-compute all character properties once (shared by all plugins).
            refresh_info(character)

            # Detectors fed with every character.
            feed_unprintable(character, info)
            feed_weird_word(character, info)
            feed_archaic_case(character, info)

            # Detectors fed with printable characters only.
            if info.printable:
                feed_symbol_punct(character, info)
                feed_range_switch(character, info)

            # Detectors fed with alphabetic characters only.
            if info.alpha:
                feed_accentuated(character, info)
                # SuspiciousDuplicateAccent: alphabetic and latin.
                if info.latin:
                    feed_duplicate_accent(character, info)

            # CjkUncommon: CJK characters only.
            if info.is_cjk:
                feed_cjk_uncommon(character, info)

            # ArabicIsolatedForm: Arabic characters only.
            if info.is_arabic:
                feed_arabic_isolated(character, info)

        mean_mess_ratio = summed_ratio()

        if mean_mess_ratio >= maximum_threshold:
            break
    else:
        # The threshold was never reached: flush the last word buffer of
        # SuperWeirdWordPlugin (and the pending chunk of ArchaicUpperLower)
        # through a trailing newline, then take the final measure.
        refresh_info("\n")
        feed_weird_word("\n", info)
        feed_archaic_case("\n", info)
        feed_unprintable("\n", info)
        mean_mess_ratio = summed_ratio()

    if debug:  # Defensive:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={step} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if seq_len > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in (
            symbol_punct,
            accentuated,
            unprintable,
            duplicate_accent,
            range_switch,
            weird_word,
            cjk_uncommon,
            archaic_case,
            arabic_isolated,
        ):
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)
|