| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454 |
- from __future__ import annotations
- import importlib
- from codecs import IncrementalDecoder
- from collections import Counter
- from functools import lru_cache
- from typing import Counter as TypeCounter
- from .constant import (
- FREQUENCIES,
- KO_NAMES,
- LANGUAGE_SUPPORTED_COUNT,
- TOO_SMALL_SEQUENCE,
- ZH_NAMES,
- _FREQUENCIES_SET,
- _FREQUENCIES_RANK,
- )
- from .md import is_suspiciously_successive_range
- from .models import CoherenceMatches
- from .utils import (
- is_accentuated,
- is_latin,
- is_multi_byte_encoding,
- is_unicode_range_secondary,
- unicode_range,
- )
def encoding_unicode_range(iana_name: str) -> list[str]:
    """
    Return associated unicode ranges in a single byte code page.
    """
    if is_multi_byte_encoding(iana_name):
        raise OSError("Function not supported on multi-byte code page")  # Defensive:

    decoder_cls = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder
    decoder: IncrementalDecoder = decoder_cls(errors="ignore")

    # Occurrence count per (primary) unicode range discovered in the code page.
    range_occurrences: dict[str, int] = {}
    # Total decodable characters with a known range (secondary ranges included).
    decoded_count: int = 0

    for byte_value in range(0x40, 0xFF):
        decoded: str = decoder.decode(bytes([byte_value]))
        if not decoded:
            continue
        detected_range: str | None = unicode_range(decoded)
        if detected_range is None:
            continue
        # Secondary ranges are not reported, but still count toward the total.
        if is_unicode_range_secondary(detected_range) is False:
            range_occurrences[detected_range] = (
                range_occurrences.get(detected_range, 0) + 1
            )
        decoded_count += 1

    # Keep only ranges covering at least 15 % of the decodable characters.
    return sorted(
        detected_range
        for detected_range, occurrences in range_occurrences.items()
        if occurrences / decoded_count >= 0.15
    )
def unicode_range_languages(primary_range: str) -> list[str]:
    """
    Return inferred languages used with a unicode range.
    """
    # A language qualifies as soon as one of its frequent characters
    # belongs to the given range.
    return [
        language
        for language, characters in FREQUENCIES.items()
        if any(unicode_range(character) == primary_range for character in characters)
    ]
@lru_cache()
def encoding_languages(iana_name: str) -> list[str]:
    """
    Single-byte encoding language association. Some code page are heavily linked to particular language(s).
    This function does the correspondence.
    """
    # Pick the first non-Latin range, if any; Latin ranges are too generic
    # to pin a language down.
    primary_range: str | None = next(
        (
            specified_range
            for specified_range in encoding_unicode_range(iana_name)
            if "Latin" not in specified_range
        ),
        None,
    )

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)
- @lru_cache()
- def mb_encoding_languages(iana_name: str) -> list[str]:
- """
- Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
- This function does the correspondence.
- """
- if (
- iana_name.startswith("shift_")
- or iana_name.startswith("iso2022_jp")
- or iana_name.startswith("euc_j")
- or iana_name == "cp932"
- ):
- return ["Japanese"]
- if iana_name.startswith("gb") or iana_name in ZH_NAMES:
- return ["Chinese"]
- if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
- return ["Korean"]
- return []
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> tuple[bool, bool]:
    """
    Determine main aspects from a supported language if it contains accents and if is pure Latin.
    """
    characters = FREQUENCIES[language]

    # Accented as soon as one frequent character carries an accent;
    # pure Latin only when every frequent character is Latin.
    have_accents: bool = any(is_accentuated(character) for character in characters)
    pure_latin: bool = all(is_latin(character) for character in characters)

    return have_accents, pure_latin
def alphabet_languages(
    characters: list[str], ignore_non_latin: bool = False
) -> list[str]:
    """
    Return associated languages associated to given characters.
    """
    matches: list[tuple[str, float]] = []
    given_characters: frozenset[str] = frozenset(characters)
    source_have_accents: bool = any(
        is_accentuated(character) for character in characters
    )

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        # Optionally drop languages that are not purely Latin based.
        if ignore_non_latin and target_pure_latin is False:
            continue

        # An accent-free language cannot explain accented input.
        if target_have_accents is False and source_have_accents:
            continue

        overlap_count: int = len(_FREQUENCIES_SET[language] & given_characters)
        coverage: float = overlap_count / len(language_characters)

        # Require at least 20 % of the language's frequent characters.
        if coverage >= 0.2:
            matches.append((language, coverage))

    matches.sort(key=lambda entry: entry[1], reverse=True)

    return [language for language, _ in matches]
def characters_popularity_compare(
    language: str, ordered_characters: list[str]
) -> float:
    """
    Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
    The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
    Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)

    :param language: Key into FREQUENCIES; raises ValueError if unsupported.
    :param ordered_characters: Characters of the analysed text, most frequent first.
    :return: Fraction of characters whose popularity rank is compatible with the language.
    """
    if language not in FREQUENCIES:
        raise ValueError(f"{language} not available")  # Defensive:

    character_approved_count: int = 0

    # Precomputed lookup structures for this language (membership set and
    # character -> rank mapping), built once at module load elsewhere.
    frequencies_language_set: frozenset[str] = _FREQUENCIES_SET[language]
    lang_rank: dict[str, int] = _FREQUENCIES_RANK[language]

    ordered_characters_count: int = len(ordered_characters)
    target_language_characters_count: int = len(FREQUENCIES[language])

    # Alphabets larger than 26 letters get a looser, distance-based rule below.
    large_alphabet: bool = target_language_characters_count > 26

    # Scale factor mapping a rank in the observed ordering onto the language's
    # (possibly longer or shorter) frequency table.
    expected_projection_ratio: float = (
        target_language_characters_count / ordered_characters_count
    )

    # Pre-built rank dict for ordered_characters (avoids repeated list slicing).
    ordered_rank: dict[str, int] = {
        char: rank for rank, char in enumerate(ordered_characters)
    }

    # Pre-compute characters common to both orderings.
    # Avoids repeated `c in ordered_rank` dict lookups in the inner counts.
    # Each tuple is (rank in language table, rank in observed ordering).
    common_chars: list[tuple[int, int]] = [
        (lr, ordered_rank[c]) for c, lr in lang_rank.items() if c in ordered_rank
    ]

    # Pre-extract lr and orr arrays for faster iteration in the inner loop.
    # Plain integer loops with local arrays are much faster under mypyc than
    # generator expression sums over a list of tuples.
    common_count: int = len(common_chars)
    common_lr: list[int] = [p[0] for p in common_chars]
    common_orr: list[int] = [p[1] for p in common_chars]

    for character, character_rank in zip(
        ordered_characters, range(0, ordered_characters_count)
    ):
        # Characters the language never uses cannot vote either way.
        if character not in frequencies_language_set:
            continue

        character_rank_in_language: int = lang_rank[character]
        character_rank_projection: int = int(character_rank * expected_projection_ratio)

        # Small alphabet: reject when the projected rank is more than 4
        # positions away from the language's rank for this character.
        if (
            large_alphabet is False
            and abs(character_rank_projection - character_rank_in_language) > 4
        ):
            continue

        # Large alphabet: approve outright when within a third of the table.
        if (
            large_alphabet is True
            and abs(character_rank_projection - character_rank_in_language)
            < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

        # Count how many characters appear "before" in both orderings,
        # and how many appear "at or after" in both orderings.
        # Single pass over pre-extracted arrays — much faster under mypyc
        # than two generator expression sums.
        before_match_count: int = 0
        after_match_count: int = 0
        for i in range(common_count):
            lr_i: int = common_lr[i]
            orr_i: int = common_orr[i]
            if lr_i < character_rank_in_language:
                if orr_i < character_rank:
                    before_match_count += 1
            else:
                if orr_i >= character_rank:
                    after_match_count += 1

        # Number of language characters at or after this character's rank
        # (the denominator for the "after" agreement ratio below).
        after_len: int = target_language_characters_count - character_rank_in_language

        # Edge cases: first/last rank in the language table — approve when
        # the disagreement on the (only meaningful) side is small.
        if character_rank_in_language == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if after_len == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        # General case: approve when at least 40 % of either neighbourhood
        # (before or after) agrees between the two orderings.
        if (
            character_rank_in_language > 0
            and before_match_count / character_rank_in_language >= 0.4
        ) or (after_len > 0 and after_match_count / after_len >= 0.4):
            character_approved_count += 1
            continue

    return character_approved_count / len(ordered_characters)
def alpha_unicode_split(decoded_sequence: str) -> list[str]:
    """
    Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
    Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
    One containing the latin letters and the other hebrew.

    :param decoded_sequence: Any decoded text; non-alphabetic characters are dropped.
    :return: One lowercased string per detected layer (alphabet group).
    """
    # Maps the layer's representative unicode range to its collected characters.
    layers: dict[str, list[str]] = {}
    # Fast path: track single-layer key to skip dict iteration for single-script text.
    single_layer_key: str | None = None
    multi_layer: bool = False
    # Cache the last character_range and its resolved layer to avoid repeated
    # is_suspiciously_successive_range calls for consecutive same-range chars.
    prev_character_range: str | None = None
    prev_layer_target: str | None = None

    for character in decoded_sequence:
        if character.isalpha() is False:
            continue

        # ASCII fast-path: a-z and A-Z are always "Basic Latin".
        # Avoids unicode_range() function call overhead for the most common case.
        character_ord: int = ord(character)
        if character_ord < 128:
            character_range: str | None = "Basic Latin"
        else:
            character_range = unicode_range(character)
            if character_range is None:
                continue

        # Fast path: same range as previous character → reuse cached layer target.
        if character_range == prev_character_range:
            if prev_layer_target is not None:
                layers[prev_layer_target].append(character)
                continue

        # Slow path: find an existing layer whose range is not "suspicious"
        # when followed by this character's range.
        layer_target_range: str | None = None

        if multi_layer:
            for discovered_range in layers:
                if (
                    is_suspiciously_successive_range(discovered_range, character_range)
                    is False
                ):
                    layer_target_range = discovered_range
                    break
        elif single_layer_key is not None:
            # Only one layer so far — check it directly, no dict iteration.
            if (
                is_suspiciously_successive_range(single_layer_key, character_range)
                is False
            ):
                layer_target_range = single_layer_key

        # No compatible layer found: this range starts its own layer.
        if layer_target_range is None:
            layer_target_range = character_range

        if layer_target_range not in layers:
            layers[layer_target_range] = []
            if single_layer_key is None:
                single_layer_key = layer_target_range
            else:
                multi_layer = True

        layers[layer_target_range].append(character)

        # Cache for next iteration
        prev_character_range = character_range
        prev_layer_target = layer_target_range

    return ["".join(chars).lower() for chars in layers.values()]
def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
    """
    This function merge results previously given by the function coherence_ratio.
    The return type is the same as coherence_ratio.
    """
    ratios_by_language: dict[str, list[float]] = {}

    # Group every observed ratio under its language.
    for partial_result in results:
        for language, ratio in partial_result:
            ratios_by_language.setdefault(language, []).append(ratio)

    # One (language, mean ratio) entry per language, rounded to 4 decimals.
    merged = [
        (language, round(sum(ratios) / len(ratios), 4))
        for language, ratios in ratios_by_language.items()
    ]

    return sorted(merged, key=lambda entry: entry[1], reverse=True)
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an alternative
    of "English". This function only keeps the best match and remove the em-dash in it.
    """
    ratios_by_name: dict[str, list[float]] = {}

    # Group ratios under the canonical (em-dash-free) language name.
    for language, ratio in results:
        canonical_name: str = language.replace("—", "")
        ratios_by_name.setdefault(canonical_name, []).append(ratio)

    # No language had an alternative variant — return input untouched.
    if all(len(ratios) == 1 for ratios in ratios_by_name.values()):
        return results

    # Keep only the best ratio per canonical language name.
    return [(name, max(ratios)) for name, ratios in ratios_by_name.items()]
@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers.
    A layer = Character extraction by alphabets/ranges.
    """
    results: list[tuple[str, float]] = []
    sufficient_match_count: int = 0

    lg_inclusion_list: list[str] = (
        lg_inclusion.split(",") if lg_inclusion is not None else []
    )
    # "Latin Based" is a pseudo-entry toggling the non-Latin filter below.
    ignore_non_latin: bool = "Latin Based" in lg_inclusion_list
    if ignore_non_latin:
        lg_inclusion_list.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        # Layers too short carry no reliable statistical signal.
        if len(layer) <= TOO_SMALL_SEQUENCE:
            continue

        most_common = Counter(layer).most_common()
        popular_character_ordered: list[str] = [
            character for character, _ in most_common
        ]

        candidate_languages = lg_inclusion_list or alphabet_languages(
            popular_character_ordered, ignore_non_latin
        )
        for language in candidate_languages:
            ratio: float = characters_popularity_compare(
                language, popular_character_ordered
            )

            if ratio < threshold:
                continue
            if ratio >= 0.8:
                sufficient_match_count += 1

            results.append((language, round(ratio, 4)))

            # Enough confident matches for this layer — stop scanning languages.
            if sufficient_match_count >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
    )
|