http_parser.py 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086
  1. import abc
  2. import asyncio
  3. import re
  4. import string
  5. from contextlib import suppress
  6. from enum import IntEnum
  7. from typing import (
  8. Any,
  9. ClassVar,
  10. Final,
  11. Generic,
  12. List,
  13. Literal,
  14. NamedTuple,
  15. Optional,
  16. Pattern,
  17. Set,
  18. Tuple,
  19. Type,
  20. TypeVar,
  21. Union,
  22. )
  23. from multidict import CIMultiDict, CIMultiDictProxy, istr
  24. from yarl import URL
  25. from . import hdrs
  26. from .base_protocol import BaseProtocol
  27. from .compression_utils import (
  28. DEFAULT_MAX_DECOMPRESS_SIZE,
  29. HAS_BROTLI,
  30. HAS_ZSTD,
  31. BrotliDecompressor,
  32. ZLibDecompressor,
  33. ZSTDDecompressor,
  34. )
  35. from .helpers import (
  36. _EXC_SENTINEL,
  37. DEBUG,
  38. EMPTY_BODY_METHODS,
  39. EMPTY_BODY_STATUS_CODES,
  40. NO_EXTENSIONS,
  41. BaseTimerContext,
  42. set_exception,
  43. )
  44. from .http_exceptions import (
  45. BadHttpMessage,
  46. BadHttpMethod,
  47. BadStatusLine,
  48. ContentEncodingError,
  49. ContentLengthError,
  50. DecompressSizeError,
  51. InvalidHeader,
  52. InvalidURLError,
  53. LineTooLong,
  54. TransferEncodingError,
  55. )
  56. from .http_writer import HttpVersion, HttpVersion10
  57. from .streams import EMPTY_PAYLOAD, StreamReader
  58. from .typedefs import RawHeaders
# Public API of this module.
__all__ = (
    "HeadersParser",
    "HttpParser",
    "HttpRequestParser",
    "HttpResponseParser",
    "RawRequestMessage",
    "RawResponseMessage",
)

# Line separator accepted by the parsers: strict CRLF, or bare LF in lax mode.
_SEP = Literal[b"\r\n", b"\n"]

ASCIISET: Final[Set[str]] = set(string.printable)

# See https://www.rfc-editor.org/rfc/rfc9110.html#name-overview
# and https://www.rfc-editor.org/rfc/rfc9110.html#name-tokens
#
# method = token
# tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+" / "-" / "." /
#         "^" / "_" / "`" / "|" / "~" / DIGIT / ALPHA
# token = 1*tchar
_TCHAR_SPECIALS: Final[str] = re.escape("!#$%&'*+-.^_`|~")
# Matches an RFC 9110 `token` (used for methods and header field names).
TOKENRE: Final[Pattern[str]] = re.compile(f"[0-9A-Za-z{_TCHAR_SPECIALS}]+")
# Matches "HTTP/<major>.<minor>" with single ASCII digits.
VERSRE: Final[Pattern[str]] = re.compile(r"HTTP/(\d)\.(\d)", re.ASCII)
# ASCII-only decimal digits (used to validate Content-Length and status codes).
DIGITS: Final[Pattern[str]] = re.compile(r"\d+", re.ASCII)
# Hex digits for chunk-size lines in chunked transfer encoding.
HEXDIGITS: Final[Pattern[bytes]] = re.compile(rb"[0-9a-fA-F]+")
class RawRequestMessage(NamedTuple):
    """Parsed HTTP request line and headers (the body is handled separately)."""

    # Request method token, e.g. "GET".
    method: str
    # Request target exactly as received (still percent-encoded).
    path: str
    version: HttpVersion
    # Case-insensitive view of the parsed headers.
    headers: "CIMultiDictProxy[str]"
    # Original (name, value) byte pairs as read off the wire.
    raw_headers: RawHeaders
    # True if the connection must be closed after this message.
    should_close: bool
    # Content-Encoding value if it is a supported codec, else None.
    compression: Optional[str]
    # True if this message requests a protocol upgrade.
    upgrade: bool
    # True if the body uses chunked transfer encoding.
    chunked: bool
    # URL built from `path` (authority/origin/asterisk/absolute form).
    url: URL
class RawResponseMessage(NamedTuple):
    """Parsed HTTP response status line and headers (body handled separately)."""

    version: HttpVersion
    # Three-digit status code as an int.
    code: int
    # Reason phrase; may be empty.
    reason: str
    # Case-insensitive view of the parsed headers.
    headers: CIMultiDictProxy[str]
    # Original (name, value) byte pairs as read off the wire.
    raw_headers: RawHeaders
    # True if the connection must be closed after this message.
    should_close: bool
    # Content-Encoding value if it is a supported codec, else None.
    compression: Optional[str]
    # True if this message signals a protocol upgrade.
    upgrade: bool
    # True if the body uses chunked transfer encoding.
    chunked: bool
# Message type produced by a concrete parser (request or response).
_MsgT = TypeVar("_MsgT", RawRequestMessage, RawResponseMessage)


class ParseState(IntEnum):
    """How the payload length of the current message is determined."""

    PARSE_NONE = 0
    PARSE_LENGTH = 1
    PARSE_CHUNKED = 2
    PARSE_UNTIL_EOF = 3


class ChunkState(IntEnum):
    """Sub-states of the chunked transfer-encoding payload parser."""

    PARSE_CHUNKED_SIZE = 0
    PARSE_CHUNKED_CHUNK = 1
    PARSE_CHUNKED_CHUNK_EOF = 2
    PARSE_MAYBE_TRAILERS = 3
    PARSE_TRAILERS = 4
class HeadersParser:
    """Parse a list of header lines into a CIMultiDict plus raw byte pairs.

    `lines` passed to :meth:`parse_headers` are separator-stripped header
    lines followed by an empty line that terminates the loop.
    """

    def __init__(
        self,
        max_line_size: int = 8190,
        max_headers: int = 32768,
        max_field_size: int = 8190,
        lax: bool = False,
    ) -> None:
        # Size limits guard against malicious/huge headers.
        self.max_line_size = max_line_size
        self.max_headers = max_headers
        self.max_field_size = max_field_size
        # lax=True additionally permits deprecated obs-fold continuations.
        self._lax = lax

    def parse_headers(
        self, lines: List[bytes]
    ) -> Tuple["CIMultiDictProxy[str]", RawHeaders]:
        """Parse header lines; raises InvalidHeader/LineTooLong on bad input."""
        headers: CIMultiDict[str] = CIMultiDict()
        # note: "raw" does not mean inclusion of OWS before/after the field value
        raw_headers = []

        lines_idx = 0
        line = lines[lines_idx]
        line_count = len(lines)

        # The terminating empty line makes `while line` exit cleanly.
        while line:
            # Parse initial header name : value pair.
            try:
                bname, bvalue = line.split(b":", 1)
            except ValueError:
                raise InvalidHeader(line) from None

            if len(bname) == 0:
                raise InvalidHeader(bname)

            # No whitespace allowed around the field name:
            # https://www.rfc-editor.org/rfc/rfc9112.html#section-5.1-2
            if {bname[0], bname[-1]} & {32, 9}:  # {" ", "\t"}
                raise InvalidHeader(line)

            bvalue = bvalue.lstrip(b" \t")
            if len(bname) > self.max_field_size:
                raise LineTooLong(
                    "request header name {}".format(
                        bname.decode("utf8", "backslashreplace")
                    ),
                    str(self.max_field_size),
                    str(len(bname)),
                )
            name = bname.decode("utf-8", "surrogateescape")
            # Field names must be RFC 9110 tokens.
            if not TOKENRE.fullmatch(name):
                raise InvalidHeader(bname)

            header_length = len(bvalue)

            # next line
            lines_idx += 1
            line = lines[lines_idx]

            # consume continuation lines (only honored in lax mode)
            continuation = self._lax and line and line[0] in (32, 9)  # (' ', '\t')

            # Deprecated: https://www.rfc-editor.org/rfc/rfc9112.html#name-obsolete-line-folding
            if continuation:
                bvalue_lst = [bvalue]
                while continuation:
                    header_length += len(line)
                    if header_length > self.max_field_size:
                        raise LineTooLong(
                            "request header field {}".format(
                                bname.decode("utf8", "backslashreplace")
                            ),
                            str(self.max_field_size),
                            str(header_length),
                        )
                    bvalue_lst.append(line)

                    # next line
                    lines_idx += 1
                    if lines_idx < line_count:
                        line = lines[lines_idx]
                        if line:
                            continuation = line[0] in (32, 9)  # (' ', '\t')
                    else:
                        line = b""
                        break
                bvalue = b"".join(bvalue_lst)
            else:
                if header_length > self.max_field_size:
                    raise LineTooLong(
                        "request header field {}".format(
                            bname.decode("utf8", "backslashreplace")
                        ),
                        str(self.max_field_size),
                        str(header_length),
                    )

            bvalue = bvalue.strip(b" \t")
            value = bvalue.decode("utf-8", "surrogateescape")

            # Reject CR/LF/NUL smuggled into a value:
            # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5
            if "\n" in value or "\r" in value or "\x00" in value:
                raise InvalidHeader(bvalue)

            headers.add(name, value)
            raw_headers.append((bname, bvalue))

        return (CIMultiDictProxy(headers), tuple(raw_headers))
  205. def _is_supported_upgrade(headers: CIMultiDictProxy[str]) -> bool:
  206. """Check if the upgrade header is supported."""
  207. u = headers.get(hdrs.UPGRADE, "")
  208. # .lower() can transform non-ascii characters.
  209. return u.isascii() and u.lower() in {"tcp", "websocket"}
  210. class HttpParser(abc.ABC, Generic[_MsgT]):
  211. lax: ClassVar[bool] = False
  212. def __init__(
  213. self,
  214. protocol: Optional[BaseProtocol] = None,
  215. loop: Optional[asyncio.AbstractEventLoop] = None,
  216. limit: int = 2**16,
  217. max_line_size: int = 8190,
  218. max_headers: int = 32768,
  219. max_field_size: int = 8190,
  220. timer: Optional[BaseTimerContext] = None,
  221. code: Optional[int] = None,
  222. method: Optional[str] = None,
  223. payload_exception: Optional[Type[BaseException]] = None,
  224. response_with_body: bool = True,
  225. read_until_eof: bool = False,
  226. auto_decompress: bool = True,
  227. ) -> None:
  228. self.protocol = protocol
  229. self.loop = loop
  230. self.max_line_size = max_line_size
  231. self.max_headers = max_headers
  232. self.max_field_size = max_field_size
  233. self.timer = timer
  234. self.code = code
  235. self.method = method
  236. self.payload_exception = payload_exception
  237. self.response_with_body = response_with_body
  238. self.read_until_eof = read_until_eof
  239. self._lines: List[bytes] = []
  240. self._tail = b""
  241. self._upgraded = False
  242. self._payload = None
  243. self._payload_parser: Optional[HttpPayloadParser] = None
  244. self._auto_decompress = auto_decompress
  245. self._limit = limit
  246. self._headers_parser = HeadersParser(
  247. max_line_size, max_headers, max_field_size, self.lax
  248. )
  249. @abc.abstractmethod
  250. def parse_message(self, lines: List[bytes]) -> _MsgT: ...
  251. @abc.abstractmethod
  252. def _is_chunked_te(self, te: str) -> bool: ...
  253. def feed_eof(self) -> Optional[_MsgT]:
  254. if self._payload_parser is not None:
  255. self._payload_parser.feed_eof()
  256. self._payload_parser = None
  257. else:
  258. # try to extract partial message
  259. if self._tail:
  260. self._lines.append(self._tail)
  261. if self._lines:
  262. if self._lines[-1] != "\r\n":
  263. self._lines.append(b"")
  264. with suppress(Exception):
  265. return self.parse_message(self._lines)
  266. return None
  267. def feed_data(
  268. self,
  269. data: bytes,
  270. SEP: _SEP = b"\r\n",
  271. EMPTY: bytes = b"",
  272. CONTENT_LENGTH: istr = hdrs.CONTENT_LENGTH,
  273. METH_CONNECT: str = hdrs.METH_CONNECT,
  274. SEC_WEBSOCKET_KEY1: istr = hdrs.SEC_WEBSOCKET_KEY1,
  275. ) -> Tuple[List[Tuple[_MsgT, StreamReader]], bool, bytes]:
  276. messages = []
  277. if self._tail:
  278. data, self._tail = self._tail + data, b""
  279. data_len = len(data)
  280. start_pos = 0
  281. loop = self.loop
  282. should_close = False
  283. while start_pos < data_len:
  284. # read HTTP message (request/response line + headers), \r\n\r\n
  285. # and split by lines
  286. if self._payload_parser is None and not self._upgraded:
  287. pos = data.find(SEP, start_pos)
  288. # consume \r\n
  289. if pos == start_pos and not self._lines:
  290. start_pos = pos + len(SEP)
  291. continue
  292. if pos >= start_pos:
  293. if should_close:
  294. raise BadHttpMessage("Data after `Connection: close`")
  295. # line found
  296. line = data[start_pos:pos]
  297. if SEP == b"\n": # For lax response parsing
  298. line = line.rstrip(b"\r")
  299. self._lines.append(line)
  300. start_pos = pos + len(SEP)
  301. # \r\n\r\n found
  302. if self._lines[-1] == EMPTY:
  303. try:
  304. msg: _MsgT = self.parse_message(self._lines)
  305. finally:
  306. self._lines.clear()
  307. def get_content_length() -> Optional[int]:
  308. # payload length
  309. length_hdr = msg.headers.get(CONTENT_LENGTH)
  310. if length_hdr is None:
  311. return None
  312. # Shouldn't allow +/- or other number formats.
  313. # https://www.rfc-editor.org/rfc/rfc9110#section-8.6-2
  314. # msg.headers is already stripped of leading/trailing wsp
  315. if not DIGITS.fullmatch(length_hdr):
  316. raise InvalidHeader(CONTENT_LENGTH)
  317. return int(length_hdr)
  318. length = get_content_length()
  319. # do not support old websocket spec
  320. if SEC_WEBSOCKET_KEY1 in msg.headers:
  321. raise InvalidHeader(SEC_WEBSOCKET_KEY1)
  322. self._upgraded = msg.upgrade and _is_supported_upgrade(
  323. msg.headers
  324. )
  325. method = getattr(msg, "method", self.method)
  326. # code is only present on responses
  327. code = getattr(msg, "code", 0)
  328. assert self.protocol is not None
  329. # calculate payload
  330. empty_body = code in EMPTY_BODY_STATUS_CODES or bool(
  331. method and method in EMPTY_BODY_METHODS
  332. )
  333. if not empty_body and (
  334. ((length is not None and length > 0) or msg.chunked)
  335. and not self._upgraded
  336. ):
  337. payload = StreamReader(
  338. self.protocol,
  339. timer=self.timer,
  340. loop=loop,
  341. limit=self._limit,
  342. )
  343. payload_parser = HttpPayloadParser(
  344. payload,
  345. length=length,
  346. chunked=msg.chunked,
  347. method=method,
  348. compression=msg.compression,
  349. code=self.code,
  350. response_with_body=self.response_with_body,
  351. auto_decompress=self._auto_decompress,
  352. lax=self.lax,
  353. headers_parser=self._headers_parser,
  354. )
  355. if not payload_parser.done:
  356. self._payload_parser = payload_parser
  357. elif method == METH_CONNECT:
  358. assert isinstance(msg, RawRequestMessage)
  359. payload = StreamReader(
  360. self.protocol,
  361. timer=self.timer,
  362. loop=loop,
  363. limit=self._limit,
  364. )
  365. self._upgraded = True
  366. self._payload_parser = HttpPayloadParser(
  367. payload,
  368. method=msg.method,
  369. compression=msg.compression,
  370. auto_decompress=self._auto_decompress,
  371. lax=self.lax,
  372. headers_parser=self._headers_parser,
  373. )
  374. elif not empty_body and length is None and self.read_until_eof:
  375. payload = StreamReader(
  376. self.protocol,
  377. timer=self.timer,
  378. loop=loop,
  379. limit=self._limit,
  380. )
  381. payload_parser = HttpPayloadParser(
  382. payload,
  383. length=length,
  384. chunked=msg.chunked,
  385. method=method,
  386. compression=msg.compression,
  387. code=self.code,
  388. response_with_body=self.response_with_body,
  389. auto_decompress=self._auto_decompress,
  390. lax=self.lax,
  391. headers_parser=self._headers_parser,
  392. )
  393. if not payload_parser.done:
  394. self._payload_parser = payload_parser
  395. else:
  396. payload = EMPTY_PAYLOAD
  397. messages.append((msg, payload))
  398. should_close = msg.should_close
  399. else:
  400. self._tail = data[start_pos:]
  401. data = EMPTY
  402. break
  403. # no parser, just store
  404. elif self._payload_parser is None and self._upgraded:
  405. assert not self._lines
  406. break
  407. # feed payload
  408. elif data and start_pos < data_len:
  409. assert not self._lines
  410. assert self._payload_parser is not None
  411. try:
  412. eof, data = self._payload_parser.feed_data(data[start_pos:], SEP)
  413. except BaseException as underlying_exc:
  414. reraised_exc = underlying_exc
  415. if self.payload_exception is not None:
  416. reraised_exc = self.payload_exception(str(underlying_exc))
  417. set_exception(
  418. self._payload_parser.payload,
  419. reraised_exc,
  420. underlying_exc,
  421. )
  422. eof = True
  423. data = b""
  424. if isinstance(
  425. underlying_exc, (InvalidHeader, TransferEncodingError)
  426. ):
  427. raise
  428. if eof:
  429. start_pos = 0
  430. data_len = len(data)
  431. self._payload_parser = None
  432. continue
  433. else:
  434. break
  435. if data and start_pos < data_len:
  436. data = data[start_pos:]
  437. else:
  438. data = EMPTY
  439. return messages, self._upgraded, data
  440. def parse_headers(
  441. self, lines: List[bytes]
  442. ) -> Tuple[
  443. "CIMultiDictProxy[str]", RawHeaders, Optional[bool], Optional[str], bool, bool
  444. ]:
  445. """Parses RFC 5322 headers from a stream.
  446. Line continuations are supported. Returns list of header name
  447. and value pairs. Header name is in upper case.
  448. """
  449. headers, raw_headers = self._headers_parser.parse_headers(lines)
  450. close_conn = None
  451. encoding = None
  452. upgrade = False
  453. chunked = False
  454. # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-6
  455. # https://www.rfc-editor.org/rfc/rfc9110.html#name-collected-abnf
  456. singletons = (
  457. hdrs.CONTENT_LENGTH,
  458. hdrs.CONTENT_LOCATION,
  459. hdrs.CONTENT_RANGE,
  460. hdrs.CONTENT_TYPE,
  461. hdrs.ETAG,
  462. hdrs.HOST,
  463. hdrs.MAX_FORWARDS,
  464. hdrs.SERVER,
  465. hdrs.TRANSFER_ENCODING,
  466. hdrs.USER_AGENT,
  467. )
  468. bad_hdr = next((h for h in singletons if len(headers.getall(h, ())) > 1), None)
  469. if bad_hdr is not None:
  470. raise BadHttpMessage(f"Duplicate '{bad_hdr}' header found.")
  471. # keep-alive
  472. conn = headers.get(hdrs.CONNECTION)
  473. if conn:
  474. v = conn.lower()
  475. if v == "close":
  476. close_conn = True
  477. elif v == "keep-alive":
  478. close_conn = False
  479. # https://www.rfc-editor.org/rfc/rfc9110.html#name-101-switching-protocols
  480. elif v == "upgrade" and headers.get(hdrs.UPGRADE):
  481. upgrade = True
  482. # encoding
  483. enc = headers.get(hdrs.CONTENT_ENCODING, "")
  484. if enc.isascii() and enc.lower() in {"gzip", "deflate", "br", "zstd"}:
  485. encoding = enc
  486. # chunking
  487. te = headers.get(hdrs.TRANSFER_ENCODING)
  488. if te is not None:
  489. if self._is_chunked_te(te):
  490. chunked = True
  491. if hdrs.CONTENT_LENGTH in headers:
  492. raise BadHttpMessage(
  493. "Transfer-Encoding can't be present with Content-Length",
  494. )
  495. return (headers, raw_headers, close_conn, encoding, upgrade, chunked)
  496. def set_upgraded(self, val: bool) -> None:
  497. """Set connection upgraded (to websocket) mode.
  498. :param bool val: new state.
  499. """
  500. self._upgraded = val
class HttpRequestParser(HttpParser[RawRequestMessage]):
    """Read request status line.

    Exception .http_exceptions.BadStatusLine
    could be raised in case of any errors in status line.
    Returns RawRequestMessage.
    """

    def parse_message(self, lines: List[bytes]) -> RawRequestMessage:
        """Parse the request line and headers into a RawRequestMessage."""
        # request line
        line = lines[0].decode("utf-8", "surrogateescape")
        try:
            method, path, version = line.split(" ", maxsplit=2)
        except ValueError:
            raise BadHttpMethod(line) from None

        if len(path) > self.max_line_size:
            raise LineTooLong(
                "Status line is too long", str(self.max_line_size), str(len(path))
            )

        # method must be an RFC 9110 token
        if not TOKENRE.fullmatch(method):
            raise BadHttpMethod(method)

        # version
        match = VERSRE.fullmatch(version)
        if match is None:
            raise BadStatusLine(line)
        version_o = HttpVersion(int(match.group(1)), int(match.group(2)))

        # Dispatch on the request-target form (RFC 7230 section 5.3).
        if method == "CONNECT":
            # authority-form,
            # https://datatracker.ietf.org/doc/html/rfc7230#section-5.3.3
            url = URL.build(authority=path, encoded=True)
        elif path.startswith("/"):
            # origin-form,
            # https://datatracker.ietf.org/doc/html/rfc7230#section-5.3.1
            path_part, _hash_separator, url_fragment = path.partition("#")
            path_part, _question_mark_separator, qs_part = path_part.partition("?")

            # NOTE: `yarl.URL.build()` is used to mimic what the Cython-based
            # NOTE: parser does, otherwise it results into the same
            # NOTE: HTTP Request-Line input producing different
            # NOTE: `yarl.URL()` objects
            url = URL.build(
                path=path_part,
                query_string=qs_part,
                fragment=url_fragment,
                encoded=True,
            )
        elif path == "*" and method == "OPTIONS":
            # asterisk-form,
            url = URL(path, encoded=True)
        else:
            # absolute-form for proxy maybe,
            # https://datatracker.ietf.org/doc/html/rfc7230#section-5.3.2
            url = URL(path, encoded=True)
            if url.scheme == "":
                # not absolute-form
                raise InvalidURLError(
                    path.encode(errors="surrogateescape").decode("latin1")
                )

        # read headers
        (
            headers,
            raw_headers,
            close,
            compression,
            upgrade,
            chunked,
        ) = self.parse_headers(lines[1:])

        if close is None:  # then the headers weren't set in the request
            if version_o <= HttpVersion10:  # HTTP 1.0 must asks to not close
                close = True
            else:  # HTTP 1.1 must ask to close.
                close = False

        return RawRequestMessage(
            method,
            path,
            version_o,
            headers,
            raw_headers,
            close,
            compression,
            upgrade,
            chunked,
            url,
        )

    def _is_chunked_te(self, te: str) -> bool:
        """Validate Transfer-Encoding: the final coding must be `chunked`."""
        te = te.rsplit(",", maxsplit=1)[-1].strip(" \t")
        # .lower() transforms some non-ascii chars, so must check first.
        if te.isascii() and te.lower() == "chunked":
            return True
        # Requests with an unknown final coding must be rejected:
        # https://www.rfc-editor.org/rfc/rfc9112#section-6.3-2.4.3
        raise BadHttpMessage("Request has invalid `Transfer-Encoding`")
class HttpResponseParser(HttpParser[RawResponseMessage]):
    """Read response status line and headers.

    BadStatusLine could be raised in case of any errors in status line.
    Returns RawResponseMessage.
    """

    # Lax mode should only be enabled on response parser.
    lax = not DEBUG

    def feed_data(
        self,
        data: bytes,
        SEP: Optional[_SEP] = None,
        *args: Any,
        **kwargs: Any,
    ) -> Tuple[List[Tuple[RawResponseMessage, StreamReader]], bool, bytes]:
        # In lax (non-DEBUG) mode split on bare LF and strip CR afterwards.
        if SEP is None:
            SEP = b"\r\n" if DEBUG else b"\n"
        return super().feed_data(data, SEP, *args, **kwargs)

    def parse_message(self, lines: List[bytes]) -> RawResponseMessage:
        """Parse the status line and headers into a RawResponseMessage."""
        line = lines[0].decode("utf-8", "surrogateescape")
        try:
            version, status = line.split(maxsplit=1)
        except ValueError:
            raise BadStatusLine(line) from None

        try:
            status, reason = status.split(maxsplit=1)
        except ValueError:
            # Reason phrase is optional.
            status = status.strip()
            reason = ""

        if len(reason) > self.max_line_size:
            raise LineTooLong(
                "Status line is too long", str(self.max_line_size), str(len(reason))
            )

        # version
        match = VERSRE.fullmatch(version)
        if match is None:
            raise BadStatusLine(line)
        version_o = HttpVersion(int(match.group(1)), int(match.group(2)))

        # The status code is a three-digit ASCII number, no padding
        if len(status) != 3 or not DIGITS.fullmatch(status):
            raise BadStatusLine(line)
        status_i = int(status)

        # read headers
        (
            headers,
            raw_headers,
            close,
            compression,
            upgrade,
            chunked,
        ) = self.parse_headers(lines[1:])

        if close is None:
            if version_o <= HttpVersion10:
                close = True
            # https://www.rfc-editor.org/rfc/rfc9112.html#name-message-body-length
            elif 100 <= status_i < 200 or status_i in {204, 304}:
                close = False
            elif hdrs.CONTENT_LENGTH in headers or hdrs.TRANSFER_ENCODING in headers:
                close = False
            else:
                # https://www.rfc-editor.org/rfc/rfc9112.html#section-6.3-2.8
                close = True

        return RawResponseMessage(
            version_o,
            status_i,
            reason.strip(),
            headers,
            raw_headers,
            close,
            compression,
            upgrade,
            chunked,
        )

    def _is_chunked_te(self, te: str) -> bool:
        # Responses tolerate unknown final codings (read-until-EOF instead):
        # https://www.rfc-editor.org/rfc/rfc9112#section-6.3-2.4.2
        return te.rsplit(",", maxsplit=1)[-1].strip(" \t").lower() == "chunked"
class HttpPayloadParser:
    """Incremental body parser feeding a StreamReader.

    Depending on the message framing it reads a fixed number of bytes
    (Content-Length), a chunked stream with optional trailers, or
    everything until EOF.
    """

    def __init__(
        self,
        payload: StreamReader,
        length: Optional[int] = None,
        chunked: bool = False,
        compression: Optional[str] = None,
        code: Optional[int] = None,
        method: Optional[str] = None,
        response_with_body: bool = True,
        auto_decompress: bool = True,
        lax: bool = False,
        *,
        headers_parser: HeadersParser,
    ) -> None:
        # Remaining bytes for PARSE_LENGTH mode.
        self._length = 0
        self._type = ParseState.PARSE_UNTIL_EOF
        self._chunk = ChunkState.PARSE_CHUNKED_SIZE
        self._chunk_size = 0
        # Partial line carried over between feed_data() calls.
        self._chunk_tail = b""
        self._auto_decompress = auto_decompress
        self._lax = lax
        # Reused for parsing trailer fields (same grammar as headers).
        self._headers_parser = headers_parser
        self._trailer_lines: list[bytes] = []
        self.done = False

        # payload decompression wrapper
        if response_with_body and compression and self._auto_decompress:
            real_payload: Union[StreamReader, DeflateBuffer] = DeflateBuffer(
                payload, compression
            )
        else:
            real_payload = payload

        # payload parser
        if not response_with_body:
            # don't parse payload if it's not expected to be received
            self._type = ParseState.PARSE_NONE
            real_payload.feed_eof()
            self.done = True
        elif chunked:
            self._type = ParseState.PARSE_CHUNKED
        elif length is not None:
            self._type = ParseState.PARSE_LENGTH
            self._length = length
            if self._length == 0:
                real_payload.feed_eof()
                self.done = True

        self.payload = real_payload

    def feed_eof(self) -> None:
        """Handle end of stream; raises if the body is known to be truncated."""
        if self._type == ParseState.PARSE_UNTIL_EOF:
            self.payload.feed_eof()
        elif self._type == ParseState.PARSE_LENGTH:
            raise ContentLengthError(
                "Not enough data to satisfy content length header."
            )
        elif self._type == ParseState.PARSE_CHUNKED:
            raise TransferEncodingError(
                "Not enough data to satisfy transfer length header."
            )

    def feed_data(
        self, chunk: bytes, SEP: _SEP = b"\r\n", CHUNK_EXT: bytes = b";"
    ) -> Tuple[bool, bytes]:
        """Feed body bytes; returns (eof, leftover_bytes_after_body)."""
        # Read specified amount of bytes
        if self._type == ParseState.PARSE_LENGTH:
            required = self._length
            chunk_len = len(chunk)

            if required >= chunk_len:
                self._length = required - chunk_len
                self.payload.feed_data(chunk, chunk_len)
                if self._length == 0:
                    self.payload.feed_eof()
                    return True, b""
            else:
                self._length = 0
                self.payload.feed_data(chunk[:required], required)
                self.payload.feed_eof()
                return True, chunk[required:]

        # Chunked transfer encoding parser
        elif self._type == ParseState.PARSE_CHUNKED:
            if self._chunk_tail:
                # Prepend the partial line left over from the previous call.
                chunk = self._chunk_tail + chunk
                self._chunk_tail = b""

            while chunk:
                # read next chunk size
                if self._chunk == ChunkState.PARSE_CHUNKED_SIZE:
                    pos = chunk.find(SEP)
                    if pos >= 0:
                        i = chunk.find(CHUNK_EXT, 0, pos)
                        if i >= 0:
                            size_b = chunk[:i]  # strip chunk-extensions
                            # Verify no LF in the chunk-extension
                            if b"\n" in (ext := chunk[i:pos]):
                                exc = TransferEncodingError(
                                    f"Unexpected LF in chunk-extension: {ext!r}"
                                )
                                set_exception(self.payload, exc)
                                raise exc
                        else:
                            size_b = chunk[:pos]

                        if self._lax:  # Allow whitespace in lax mode.
                            size_b = size_b.strip()

                        if not re.fullmatch(HEXDIGITS, size_b):
                            exc = TransferEncodingError(
                                chunk[:pos].decode("ascii", "surrogateescape")
                            )
                            set_exception(self.payload, exc)
                            raise exc
                        size = int(bytes(size_b), 16)

                        chunk = chunk[pos + len(SEP) :]
                        if size == 0:  # eof marker
                            self._chunk = ChunkState.PARSE_TRAILERS
                            if self._lax and chunk.startswith(b"\r"):
                                chunk = chunk[1:]
                        else:
                            self._chunk = ChunkState.PARSE_CHUNKED_CHUNK
                            self._chunk_size = size
                            self.payload.begin_http_chunk_receiving()
                    else:
                        # Size line incomplete; wait for more data.
                        self._chunk_tail = chunk
                        return False, b""

                # read chunk and feed buffer
                if self._chunk == ChunkState.PARSE_CHUNKED_CHUNK:
                    required = self._chunk_size
                    chunk_len = len(chunk)

                    if required > chunk_len:
                        self._chunk_size = required - chunk_len
                        self.payload.feed_data(chunk, chunk_len)
                        return False, b""
                    else:
                        self._chunk_size = 0
                        self.payload.feed_data(chunk[:required], required)
                        chunk = chunk[required:]
                        self._chunk = ChunkState.PARSE_CHUNKED_CHUNK_EOF
                        self.payload.end_http_chunk_receiving()

                # toss the CRLF at the end of the chunk
                if self._chunk == ChunkState.PARSE_CHUNKED_CHUNK_EOF:
                    if self._lax and chunk.startswith(b"\r"):
                        chunk = chunk[1:]
                    if chunk[: len(SEP)] == SEP:
                        chunk = chunk[len(SEP) :]
                        self._chunk = ChunkState.PARSE_CHUNKED_SIZE
                    else:
                        self._chunk_tail = chunk
                        return False, b""

                if self._chunk == ChunkState.PARSE_TRAILERS:
                    pos = chunk.find(SEP)
                    if pos < 0:  # No line found
                        self._chunk_tail = chunk
                        return False, b""

                    line = chunk[:pos]
                    chunk = chunk[pos + len(SEP) :]
                    if SEP == b"\n":  # For lax response parsing
                        line = line.rstrip(b"\r")
                    self._trailer_lines.append(line)

                    # \r\n\r\n found, end of stream
                    if self._trailer_lines[-1] == b"":
                        # Headers and trailers are defined the same way,
                        # so we reuse the HeadersParser here.
                        try:
                            trailers, raw_trailers = self._headers_parser.parse_headers(
                                self._trailer_lines
                            )
                        finally:
                            self._trailer_lines.clear()
                        self.payload.feed_eof()
                        return True, chunk

        # Read all bytes until eof
        elif self._type == ParseState.PARSE_UNTIL_EOF:
            self.payload.feed_data(chunk, len(chunk))

        return False, b""
class DeflateBuffer:
    """DeflateStream decompress stream and feed data into specified stream."""

    decompressor: Any

    def __init__(
        self,
        out: StreamReader,
        encoding: Optional[str],
        max_decompress_size: int = DEFAULT_MAX_DECOMPRESS_SIZE,
    ) -> None:
        self.out = out
        # Running count of compressed bytes fed in (exposed on the reader).
        self.size = 0
        out.total_compressed_bytes = self.size
        self.encoding = encoding
        self._started_decoding = False

        self.decompressor: Union[BrotliDecompressor, ZLibDecompressor, ZSTDDecompressor]
        if encoding == "br":
            if not HAS_BROTLI:  # pragma: no cover
                raise ContentEncodingError(
                    "Can not decode content-encoding: brotli (br). "
                    "Please install `Brotli`"
                )
            self.decompressor = BrotliDecompressor()
        elif encoding == "zstd":
            if not HAS_ZSTD:
                raise ContentEncodingError(
                    "Can not decode content-encoding: zstandard (zstd). "
                    "Please install `backports.zstd`"
                )
            self.decompressor = ZSTDDecompressor()
        else:
            # "gzip"/"deflate" (and anything else) go through zlib.
            self.decompressor = ZLibDecompressor(encoding=encoding)
        # Cap on total decompressed output, to bound zip-bomb style payloads.
        self._max_decompress_size = max_decompress_size

    def set_exception(
        self,
        exc: BaseException,
        exc_cause: BaseException = _EXC_SENTINEL,
    ) -> None:
        """Propagate an exception to the wrapped output stream."""
        set_exception(self.out, exc, exc_cause)

    def feed_data(self, chunk: bytes, size: int) -> None:
        """Decompress `chunk` and feed the result to the output stream."""
        if not size:
            return

        self.size += size
        self.out.total_compressed_bytes = self.size

        # RFC1950
        # bits 0..3 = CM = 0b1000 = 8 = "deflate"
        # bits 4..7 = CINFO = 1..7 = windows size.
        if (
            not self._started_decoding
            and self.encoding == "deflate"
            and chunk[0] & 0xF != 8
        ):
            # Change the decoder to decompress incorrectly compressed data
            # Actually we should issue a warning about non-RFC-compliant data.
            self.decompressor = ZLibDecompressor(
                encoding=self.encoding, suppress_deflate_header=True
            )

        try:
            # Decompress with limit + 1 so we can detect if output exceeds limit
            chunk = self.decompressor.decompress_sync(
                chunk, max_length=self._max_decompress_size + 1
            )
        except Exception:
            raise ContentEncodingError(
                "Can not decode content-encoding: %s" % self.encoding
            )

        self._started_decoding = True

        # Check if decompression limit was exceeded
        if len(chunk) > self._max_decompress_size:
            raise DecompressSizeError(
                "Decompressed data exceeds the configured limit of %d bytes"
                % self._max_decompress_size
            )

        if chunk:
            self.out.feed_data(chunk, len(chunk))

    def feed_eof(self) -> None:
        """Flush the decompressor and close the output stream."""
        chunk = self.decompressor.flush()

        if chunk or self.size > 0:
            self.out.feed_data(chunk, len(chunk))
            # A deflate stream must reach its end marker; otherwise truncated.
            if self.encoding == "deflate" and not self.decompressor.eof:
                raise ContentEncodingError("deflate")

        self.out.feed_eof()

    def begin_http_chunk_receiving(self) -> None:
        self.out.begin_http_chunk_receiving()

    def end_http_chunk_receiving(self) -> None:
        self.out.end_http_chunk_receiving()
# Keep explicit references to the pure-Python implementations before the
# optional C extension (if available) rebinds the public names.
HttpRequestParserPy = HttpRequestParser
HttpResponseParserPy = HttpResponseParser
RawRequestMessagePy = RawRequestMessage
RawResponseMessagePy = RawResponseMessage

try:
    if not NO_EXTENSIONS:
        # Prefer the accelerated Cython parser when extensions are enabled.
        from ._http_parser import (  # type: ignore[import-not-found,no-redef]
            HttpRequestParser,
            HttpResponseParser,
            RawRequestMessage,
            RawResponseMessage,
        )

        HttpRequestParserC = HttpRequestParser
        HttpResponseParserC = HttpResponseParser
        RawRequestMessageC = RawRequestMessage
        RawResponseMessageC = RawResponseMessage
except ImportError:  # pragma: no cover
    # Fall back silently to the pure-Python implementations above.
    pass