_parse.py 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202
  1. """URL parsing utilities."""
  2. import re
  3. import unicodedata
  4. from functools import lru_cache
  5. from urllib.parse import scheme_chars, uses_netloc
  6. from ._quoters import QUOTER, UNQUOTER_PLUS
  7. # Leading and trailing C0 control and space to be stripped per WHATWG spec.
  8. # == "".join([chr(i) for i in range(0, 0x20 + 1)])
  9. WHATWG_C0_CONTROL_OR_SPACE = (
  10. "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10"
  11. "\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f "
  12. )
  13. # Unsafe bytes to be removed per WHATWG spec
  14. UNSAFE_URL_BYTES_TO_REMOVE = ["\t", "\r", "\n"]
  15. USES_AUTHORITY = frozenset(uses_netloc)
  16. SplitURLType = tuple[str, str, str, str, str]
  17. def split_url(url: str) -> SplitURLType:
  18. """Split URL into parts."""
  19. # Adapted from urllib.parse.urlsplit
  20. # Only lstrip url as some applications rely on preserving trailing space.
  21. # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
  22. url = url.lstrip(WHATWG_C0_CONTROL_OR_SPACE)
  23. for b in UNSAFE_URL_BYTES_TO_REMOVE:
  24. if b in url:
  25. url = url.replace(b, "")
  26. scheme = netloc = query = fragment = ""
  27. i = url.find(":")
  28. if i > 0 and url[0] in scheme_chars:
  29. for c in url[1:i]:
  30. if c not in scheme_chars:
  31. break
  32. else:
  33. scheme, url = url[:i].lower(), url[i + 1 :]
  34. has_hash = "#" in url
  35. has_question_mark = "?" in url
  36. if url[:2] == "//":
  37. delim = len(url) # position of end of domain part of url, default is end
  38. if has_hash and has_question_mark:
  39. delim_chars = "/?#"
  40. elif has_question_mark:
  41. delim_chars = "/?"
  42. elif has_hash:
  43. delim_chars = "/#"
  44. else:
  45. delim_chars = "/"
  46. for c in delim_chars: # look for delimiters; the order is NOT important
  47. wdelim = url.find(c, 2) # find first of this delim
  48. if wdelim >= 0 and wdelim < delim: # if found
  49. delim = wdelim # use earliest delim position
  50. netloc = url[2:delim]
  51. url = url[delim:]
  52. has_left_bracket = "[" in netloc
  53. has_right_bracket = "]" in netloc
  54. if (has_left_bracket and not has_right_bracket) or (
  55. has_right_bracket and not has_left_bracket
  56. ):
  57. raise ValueError("Invalid IPv6 URL")
  58. if has_left_bracket:
  59. bracketed_host = netloc.partition("[")[2].partition("]")[0]
  60. # Valid bracketed hosts are defined in
  61. # https://www.rfc-editor.org/rfc/rfc3986#page-49
  62. # https://url.spec.whatwg.org/
  63. if bracketed_host and bracketed_host[0] == "v":
  64. if not re.match(r"\Av[a-fA-F0-9]+\..+\Z", bracketed_host):
  65. raise ValueError("IPvFuture address is invalid")
  66. elif ":" not in bracketed_host:
  67. raise ValueError("The IPv6 content between brackets is not valid")
  68. if has_hash:
  69. url, _, fragment = url.partition("#")
  70. if has_question_mark:
  71. url, _, query = url.partition("?")
  72. if netloc and not netloc.isascii():
  73. _check_netloc(netloc)
  74. return scheme, netloc, url, query, fragment
  75. def _check_netloc(netloc: str) -> None:
  76. # Adapted from urllib.parse._checknetloc
  77. # looking for characters like \u2100 that expand to 'a/c'
  78. # IDNA uses NFKC equivalence, so normalize for this check
  79. # ignore characters already included
  80. # but not the surrounding text
  81. n = netloc.replace("@", "").replace(":", "").replace("#", "").replace("?", "")
  82. normalized_netloc = unicodedata.normalize("NFKC", n)
  83. if n == normalized_netloc:
  84. return
  85. # Note that there are no unicode decompositions for the character '@' so
  86. # its currently impossible to have test coverage for this branch, however if the
  87. # one should be added in the future we want to make sure its still checked.
  88. for c in "/?#@:": # pragma: no branch
  89. if c in normalized_netloc:
  90. raise ValueError(
  91. f"netloc '{netloc}' contains invalid "
  92. "characters under NFKC normalization"
  93. )
  94. @lru_cache # match the same size as urlsplit
  95. def split_netloc(
  96. netloc: str,
  97. ) -> tuple[str | None, str | None, str | None, int | None]:
  98. """Split netloc into username, password, host and port."""
  99. if "@" not in netloc:
  100. username: str | None = None
  101. password: str | None = None
  102. hostinfo = netloc
  103. else:
  104. userinfo, _, hostinfo = netloc.rpartition("@")
  105. username, have_password, password = userinfo.partition(":")
  106. if not have_password:
  107. password = None
  108. if "[" in hostinfo:
  109. _, _, bracketed = hostinfo.partition("[")
  110. hostname, _, port_str = bracketed.partition("]")
  111. _, _, port_str = port_str.partition(":")
  112. else:
  113. hostname, _, port_str = hostinfo.partition(":")
  114. if not port_str:
  115. return username or None, password, hostname or None, None
  116. try:
  117. port = int(port_str)
  118. except ValueError:
  119. raise ValueError("Invalid URL: port can't be converted to integer")
  120. if not (0 <= port <= 65535):
  121. raise ValueError("Port out of range 0-65535")
  122. return username or None, password, hostname or None, port
  123. def unsplit_result(
  124. scheme: str, netloc: str, url: str, query: str, fragment: str
  125. ) -> str:
  126. """Unsplit a URL without any normalization."""
  127. if netloc or (scheme and scheme in USES_AUTHORITY) or url[:2] == "//":
  128. if url and url[:1] != "/":
  129. url = f"{scheme}://{netloc}/{url}" if scheme else f"{scheme}:{url}"
  130. else:
  131. url = f"{scheme}://{netloc}{url}" if scheme else f"//{netloc}{url}"
  132. elif scheme:
  133. url = f"{scheme}:{url}"
  134. if query:
  135. url = f"{url}?{query}"
  136. return f"{url}#{fragment}" if fragment else url
  137. @lru_cache # match the same size as urlsplit
  138. def make_netloc(
  139. user: str | None,
  140. password: str | None,
  141. host: str | None,
  142. port: int | None,
  143. encode: bool = False,
  144. ) -> str:
  145. """Make netloc from parts.
  146. The user and password are encoded if encode is True.
  147. The host must already be encoded with _encode_host.
  148. """
  149. if host is None:
  150. return ""
  151. ret = host
  152. if port is not None:
  153. ret = f"{ret}:{port}"
  154. if user is None and password is None:
  155. return ret
  156. if password is not None:
  157. if not user:
  158. user = ""
  159. elif encode:
  160. user = QUOTER(user)
  161. if encode:
  162. password = QUOTER(password)
  163. user = f"{user}:{password}"
  164. elif user and encode:
  165. user = QUOTER(user)
  166. return f"{user}@{ret}" if user else ret
  167. def query_to_pairs(query_string: str) -> list[tuple[str, str]]:
  168. """Parse a query given as a string argument.
  169. Works like urllib.parse.parse_qsl with keep empty values.
  170. """
  171. pairs: list[tuple[str, str]] = []
  172. if not query_string:
  173. return pairs
  174. for k_v in query_string.split("&"):
  175. k, _, v = k_v.partition("=")
  176. pairs.append((UNQUOTER_PLUS(k), UNQUOTER_PLUS(v)))
  177. return pairs