# _cookie_helpers.py
"""
Internal cookie handling helpers.

This module contains internal utilities for cookie parsing and manipulation.
These are not part of the public API and may change without notice.
"""
  6. import re
  7. from http.cookies import Morsel
  8. from typing import List, Optional, Sequence, Tuple, cast
  9. from .log import internal_logger
# Names exported by a star-import of this module. The underscore-prefixed
# constants and helpers defined below are intentionally left out.
__all__ = (
    "parse_set_cookie_headers",
    "parse_cookie_header",
    "preserve_morsel_with_coded_value",
)
# Cookie parsing constants

# Allow more characters in cookie names to handle real-world cookies
# that don't strictly follow RFC standards (fixes #2683).
# RFC 6265 defines cookie-name token as per RFC 2616 Section 2.2,
# but many servers send cookies with characters like {} [] () etc.
# This makes the cookie parser more tolerant of real-world cookies
# while still providing some validation to catch obviously malformed names.
#
# The pattern is anchored at both ends and requires at least one character,
# so an empty name never validates. Both parsers in this module apply it to
# a candidate name before a Morsel is created for it.
_COOKIE_NAME_RE = re.compile(r"^[!#$%&\'()*+\-./0-9:<=>?@A-Z\[\]^_`a-z{|}~]+$")
  23. _COOKIE_KNOWN_ATTRS = frozenset( # AKA Morsel._reserved
  24. (
  25. "path",
  26. "domain",
  27. "max-age",
  28. "expires",
  29. "secure",
  30. "httponly",
  31. "samesite",
  32. "partitioned",
  33. "version",
  34. "comment",
  35. )
  36. )
  37. _COOKIE_BOOL_ATTRS = frozenset( # AKA Morsel._flags
  38. ("secure", "httponly", "partitioned")
  39. )
# SimpleCookie's pattern for parsing cookies with relaxed validation
# Based on http.cookies pattern but extended to allow more characters in cookie names
# to handle real-world cookies (fixes #2683)
#
# One match consumes a single ``key`` or ``key=value`` item plus its
# terminator (whitespace, ";" or end of string). The parsers in this module
# call ``.match(header, pos)`` repeatedly, read only the named groups
# ``key`` and ``val``, and resume from ``match.end(0)``.
#
# ``val`` is ``None`` when the item has no "=value" part. Its alternatives
# are tried in this order:
#   1. a properly closed double-quoted string (backslash escapes allowed),
#   2. an unmatched opening quote running up to the next '"' or ';',
#   3. a date with a comma after the day name (the "expires" special case),
#   4. an ANSI C asctime() date,
#   5. a bare token, possibly empty.
#
# The pattern is compiled with re.VERBOSE (whitespace and "#" comments inside
# the pattern are ignored) and re.ASCII (\w, \d and \s match ASCII only).
_COOKIE_PATTERN = re.compile(
    r"""
    \s*  # Optional whitespace at start of cookie
    (?P<key>  # Start of group 'key'
        # aiohttp has extended to include [] for compatibility with real-world cookies
        [\w\d!#%&'~_`><@,:/\$\*\+\-\.\^\|\)\(\?\}\{\[\]]+  # Any word of at least one letter
    )  # End of group 'key'
    (  # Optional group: there may not be a value.
        \s*=\s*  # Equal Sign
        (?P<val>  # Start of group 'val'
            "(?:[^\\"]|\\.)*"  # Any double-quoted string (properly closed)
            |  # or
            "[^";]*  # Unmatched opening quote (differs from SimpleCookie - issue #7993)
            |  # or
            # Special case for "expires" attr - RFC 822, RFC 850, RFC 1036, RFC 1123
            (\w{3,6}day|\w{3}),\s  # Day of the week or abbreviated day (with comma)
            [\w\d\s-]{9,11}\s[\d:]{8}\s  # Date and time in specific format
            (GMT|[+-]\d{4})  # Timezone: GMT or RFC 2822 offset like -0000, +0100
            # NOTE: RFC 2822 timezone support is an aiohttp extension
            # for issue #4493 - SimpleCookie does NOT support this
            |  # or
            # ANSI C asctime() format: "Wed Jun 9 10:18:14 2021"
            # NOTE: This is an aiohttp extension for issue #4327 - SimpleCookie does NOT support this format
            \w{3}\s+\w{3}\s+[\s\d]\d\s+\d{2}:\d{2}:\d{2}\s+\d{4}
            |  # or
            [\w\d!#%&'~_`><@,:/\$\*\+\-\.\^\|\)\(\?\}\{\=\[\]]*  # Any word or empty string
        )  # End of group 'val'
    )?  # End of optional value group
    \s*  # Any number of spaces.
    (\s+|;|$)  # Ending either at space, semicolon, or EOS.
    """,
    re.VERBOSE | re.ASCII,
)
  76. def preserve_morsel_with_coded_value(cookie: Morsel[str]) -> Morsel[str]:
  77. """
  78. Preserve a Morsel's coded_value exactly as received from the server.
  79. This function ensures that cookie encoding is preserved exactly as sent by
  80. the server, which is critical for compatibility with old servers that have
  81. strict requirements about cookie formats.
  82. This addresses the issue described in https://github.com/aio-libs/aiohttp/pull/1453
  83. where Python's SimpleCookie would re-encode cookies, breaking authentication
  84. with certain servers.
  85. Args:
  86. cookie: A Morsel object from SimpleCookie
  87. Returns:
  88. A Morsel object with preserved coded_value
  89. """
  90. mrsl_val = cast("Morsel[str]", cookie.get(cookie.key, Morsel()))
  91. # We use __setstate__ instead of the public set() API because it allows us to
  92. # bypass validation and set already validated state. This is more stable than
  93. # setting protected attributes directly and unlikely to change since it would
  94. # break pickling.
  95. mrsl_val.__setstate__( # type: ignore[attr-defined]
  96. {"key": cookie.key, "value": cookie.value, "coded_value": cookie.coded_value}
  97. )
  98. return mrsl_val
  99. _unquote_sub = re.compile(r"\\(?:([0-3][0-7][0-7])|(.))").sub
  100. def _unquote_replace(m: re.Match[str]) -> str:
  101. """
  102. Replace function for _unquote_sub regex substitution.
  103. Handles escaped characters in cookie values:
  104. - Octal sequences are converted to their character representation
  105. - Other escaped characters are unescaped by removing the backslash
  106. """
  107. if m[1]:
  108. return chr(int(m[1], 8))
  109. return m[2]
  110. def _unquote(value: str) -> str:
  111. """
  112. Unquote a cookie value.
  113. Vendored from http.cookies._unquote to ensure compatibility.
  114. Note: The original implementation checked for None, but we've removed
  115. that check since all callers already ensure the value is not None.
  116. """
  117. # If there aren't any doublequotes,
  118. # then there can't be any special characters. See RFC 2109.
  119. if len(value) < 2:
  120. return value
  121. if value[0] != '"' or value[-1] != '"':
  122. return value
  123. # We have to assume that we must decode this string.
  124. # Down to work.
  125. # Remove the "s
  126. value = value[1:-1]
  127. # Check for special sequences. Examples:
  128. # \012 --> \n
  129. # \" --> "
  130. #
  131. return _unquote_sub(_unquote_replace, value)
  132. def parse_cookie_header(header: str) -> List[Tuple[str, Morsel[str]]]:
  133. """
  134. Parse a Cookie header according to RFC 6265 Section 5.4.
  135. Cookie headers contain only name-value pairs separated by semicolons.
  136. There are no attributes in Cookie headers - even names that match
  137. attribute names (like 'path' or 'secure') should be treated as cookies.
  138. This parser uses the same regex-based approach as parse_set_cookie_headers
  139. to properly handle quoted values that may contain semicolons. When the
  140. regex fails to match a malformed cookie, it falls back to simple parsing
  141. to ensure subsequent cookies are not lost
  142. https://github.com/aio-libs/aiohttp/issues/11632
  143. Args:
  144. header: The Cookie header value to parse
  145. Returns:
  146. List of (name, Morsel) tuples for compatibility with SimpleCookie.update()
  147. """
  148. if not header:
  149. return []
  150. cookies: List[Tuple[str, Morsel[str]]] = []
  151. morsel: Morsel[str]
  152. i = 0
  153. n = len(header)
  154. invalid_names = []
  155. while i < n:
  156. # Use the same pattern as parse_set_cookie_headers to find cookies
  157. match = _COOKIE_PATTERN.match(header, i)
  158. if not match:
  159. # Fallback for malformed cookies https://github.com/aio-libs/aiohttp/issues/11632
  160. # Find next semicolon to skip or attempt simple key=value parsing
  161. next_semi = header.find(";", i)
  162. eq_pos = header.find("=", i)
  163. # Try to extract key=value if '=' comes before ';'
  164. if eq_pos != -1 and (next_semi == -1 or eq_pos < next_semi):
  165. end_pos = next_semi if next_semi != -1 else n
  166. key = header[i:eq_pos].strip()
  167. value = header[eq_pos + 1 : end_pos].strip()
  168. # Validate the name (same as regex path)
  169. if not _COOKIE_NAME_RE.match(key):
  170. invalid_names.append(key)
  171. else:
  172. morsel = Morsel()
  173. morsel.__setstate__( # type: ignore[attr-defined]
  174. {"key": key, "value": _unquote(value), "coded_value": value}
  175. )
  176. cookies.append((key, morsel))
  177. # Move to next cookie or end
  178. i = next_semi + 1 if next_semi != -1 else n
  179. continue
  180. key = match.group("key")
  181. value = match.group("val") or ""
  182. i = match.end(0)
  183. # Validate the name
  184. if not key or not _COOKIE_NAME_RE.match(key):
  185. invalid_names.append(key)
  186. continue
  187. # Create new morsel
  188. morsel = Morsel()
  189. # Preserve the original value as coded_value (with quotes if present)
  190. # We use __setstate__ instead of the public set() API because it allows us to
  191. # bypass validation and set already validated state. This is more stable than
  192. # setting protected attributes directly and unlikely to change since it would
  193. # break pickling.
  194. morsel.__setstate__( # type: ignore[attr-defined]
  195. {"key": key, "value": _unquote(value), "coded_value": value}
  196. )
  197. cookies.append((key, morsel))
  198. if invalid_names:
  199. internal_logger.debug(
  200. "Cannot load cookie. Illegal cookie names: %r", invalid_names
  201. )
  202. return cookies
  203. def parse_set_cookie_headers(headers: Sequence[str]) -> List[Tuple[str, Morsel[str]]]:
  204. """
  205. Parse cookie headers using a vendored version of SimpleCookie parsing.
  206. This implementation is based on SimpleCookie.__parse_string to ensure
  207. compatibility with how SimpleCookie parses cookies, including handling
  208. of malformed cookies with missing semicolons.
  209. This function is used for both Cookie and Set-Cookie headers in order to be
  210. forgiving. Ideally we would have followed RFC 6265 Section 5.2 (for Cookie
  211. headers) and RFC 6265 Section 4.2.1 (for Set-Cookie headers), but the
  212. real world data makes it impossible since we need to be a bit more forgiving.
  213. NOTE: This implementation differs from SimpleCookie in handling unmatched quotes.
  214. SimpleCookie will stop parsing when it encounters a cookie value with an unmatched
  215. quote (e.g., 'cookie="value'), causing subsequent cookies to be silently dropped.
  216. This implementation handles unmatched quotes more gracefully to prevent cookie loss.
  217. See https://github.com/aio-libs/aiohttp/issues/7993
  218. """
  219. parsed_cookies: List[Tuple[str, Morsel[str]]] = []
  220. for header in headers:
  221. if not header:
  222. continue
  223. # Parse cookie string using SimpleCookie's algorithm
  224. i = 0
  225. n = len(header)
  226. current_morsel: Optional[Morsel[str]] = None
  227. morsel_seen = False
  228. while 0 <= i < n:
  229. # Start looking for a cookie
  230. match = _COOKIE_PATTERN.match(header, i)
  231. if not match:
  232. # No more cookies
  233. break
  234. key, value = match.group("key"), match.group("val")
  235. i = match.end(0)
  236. lower_key = key.lower()
  237. if key[0] == "$":
  238. if not morsel_seen:
  239. # We ignore attributes which pertain to the cookie
  240. # mechanism as a whole, such as "$Version".
  241. continue
  242. # Process as attribute
  243. if current_morsel is not None:
  244. attr_lower_key = lower_key[1:]
  245. if attr_lower_key in _COOKIE_KNOWN_ATTRS:
  246. current_morsel[attr_lower_key] = value or ""
  247. elif lower_key in _COOKIE_KNOWN_ATTRS:
  248. if not morsel_seen:
  249. # Invalid cookie string - attribute before cookie
  250. break
  251. if lower_key in _COOKIE_BOOL_ATTRS:
  252. # Boolean attribute with any value should be True
  253. if current_morsel is not None and current_morsel.isReservedKey(key):
  254. current_morsel[lower_key] = True
  255. elif value is None:
  256. # Invalid cookie string - non-boolean attribute without value
  257. break
  258. elif current_morsel is not None:
  259. # Regular attribute with value
  260. current_morsel[lower_key] = _unquote(value)
  261. elif value is not None:
  262. # This is a cookie name=value pair
  263. # Validate the name
  264. if key in _COOKIE_KNOWN_ATTRS or not _COOKIE_NAME_RE.match(key):
  265. internal_logger.warning(
  266. "Can not load cookies: Illegal cookie name %r", key
  267. )
  268. current_morsel = None
  269. else:
  270. # Create new morsel
  271. current_morsel = Morsel()
  272. # Preserve the original value as coded_value (with quotes if present)
  273. # We use __setstate__ instead of the public set() API because it allows us to
  274. # bypass validation and set already validated state. This is more stable than
  275. # setting protected attributes directly and unlikely to change since it would
  276. # break pickling.
  277. current_morsel.__setstate__( # type: ignore[attr-defined]
  278. {"key": key, "value": _unquote(value), "coded_value": value}
  279. )
  280. parsed_cookies.append((key, current_morsel))
  281. morsel_seen = True
  282. else:
  283. # Invalid cookie string - no value for non-attribute
  284. break
  285. return parsed_cookies