legacy.py 2.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879
  1. from __future__ import annotations
  2. from typing import TYPE_CHECKING, Any
  3. from warnings import warn
  4. from .api import from_bytes
  5. from .constant import CHARDET_CORRESPONDENCE, TOO_SMALL_SEQUENCE
  6. if TYPE_CHECKING:
  7. from typing import TypedDict
  8. class ResultDict(TypedDict):
  9. encoding: str | None
  10. language: str
  11. confidence: float | None
  12. def detect(
  13. byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
  14. ) -> ResultDict:
  15. """
  16. chardet legacy method
  17. Detect the encoding of the given byte string. It should be mostly backward-compatible.
  18. Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it)
  19. This function is deprecated and should be used to migrate your project easily, consult the documentation for
  20. further information. Not planned for removal.
  21. :param byte_str: The byte sequence to examine.
  22. :param should_rename_legacy: Should we rename legacy encodings
  23. to their more modern equivalents?
  24. """
  25. if len(kwargs):
  26. warn(
  27. f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
  28. )
  29. if not isinstance(byte_str, (bytearray, bytes)):
  30. raise TypeError( # pragma: nocover
  31. f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
  32. )
  33. if isinstance(byte_str, bytearray):
  34. byte_str = bytes(byte_str)
  35. r = from_bytes(byte_str).best()
  36. encoding = r.encoding if r is not None else None
  37. language = r.language if r is not None and r.language != "Unknown" else ""
  38. confidence = 1.0 - r.chaos if r is not None else None
  39. # automatically lower confidence
  40. # on small bytes samples.
  41. # https://github.com/jawah/charset_normalizer/issues/391
  42. if (
  43. confidence is not None
  44. and confidence >= 0.9
  45. and encoding
  46. not in {
  47. "utf_8",
  48. "ascii",
  49. }
  50. and r.bom is False # type: ignore[union-attr]
  51. and len(byte_str) < TOO_SMALL_SEQUENCE
  52. ):
  53. confidence -= 0.2
  54. # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
  55. # but chardet does return 'utf-8-sig' and it is a valid codec name.
  56. if r is not None and encoding == "utf_8" and r.bom:
  57. encoding += "_sig"
  58. if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
  59. encoding = CHARDET_CORRESPONDENCE[encoding]
  60. return {
  61. "encoding": encoding,
  62. "language": language,
  63. "confidence": confidence,
  64. }