_quoting_c.pyx 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451
  1. from cpython.exc cimport PyErr_NoMemory
  2. from cpython.mem cimport PyMem_Free, PyMem_Malloc, PyMem_Realloc
  3. from cpython.unicode cimport (
  4. PyUnicode_DATA,
  5. PyUnicode_DecodeASCII,
  6. PyUnicode_DecodeUTF8Stateful,
  7. PyUnicode_GET_LENGTH,
  8. PyUnicode_KIND,
  9. PyUnicode_READ,
  10. )
  11. from libc.stdint cimport uint8_t, uint64_t
  12. from libc.string cimport memcpy, memset
  13. from string import ascii_letters, digits
  14. cdef str GEN_DELIMS = ":/?#[]@"
  15. cdef str SUB_DELIMS_WITHOUT_QS = "!$'()*,"
  16. cdef str SUB_DELIMS = SUB_DELIMS_WITHOUT_QS + '+?=;'
  17. cdef str RESERVED = GEN_DELIMS + SUB_DELIMS
  18. cdef str UNRESERVED = ascii_letters + digits + '-._~'
  19. cdef str ALLOWED = UNRESERVED + SUB_DELIMS_WITHOUT_QS
  20. cdef str QS = '+&=;'
  21. DEF BUF_SIZE = 8 * 1024 # 8KiB
  22. cdef inline Py_UCS4 _to_hex(uint8_t v) noexcept:
  23. if v < 10:
  24. return <Py_UCS4>(v+0x30) # ord('0') == 0x30
  25. else:
  26. return <Py_UCS4>(v+0x41-10) # ord('A') == 0x41
  27. cdef inline int _from_hex(Py_UCS4 v) noexcept:
  28. if '0' <= v <= '9':
  29. return <int>(v) - 0x30 # ord('0') == 0x30
  30. elif 'A' <= v <= 'F':
  31. return <int>(v) - 0x41 + 10 # ord('A') == 0x41
  32. elif 'a' <= v <= 'f':
  33. return <int>(v) - 0x61 + 10 # ord('a') == 0x61
  34. else:
  35. return -1
  36. cdef inline int _is_lower_hex(Py_UCS4 v) noexcept:
  37. return 'a' <= v <= 'f'
  38. cdef inline long _restore_ch(Py_UCS4 d1, Py_UCS4 d2):
  39. cdef int digit1 = _from_hex(d1)
  40. if digit1 < 0:
  41. return -1
  42. cdef int digit2 = _from_hex(d2)
  43. if digit2 < 0:
  44. return -1
  45. return digit1 << 4 | digit2
# 128-bit bitmaps (16 bytes * 8 bits), one bit per ASCII code point.
# They are read and written with bit_at() / set_bit() and filled in by the
# module-level initialisation loop.
# ALLOWED_TABLE: bits set for the characters in ALLOWED.
cdef uint8_t ALLOWED_TABLE[16]
# ALLOWED_NOTQS_TABLE: bits set for the characters in ALLOWED and in QS.
cdef uint8_t ALLOWED_NOTQS_TABLE[16]
  48. cdef inline bint bit_at(uint8_t array[], uint64_t ch) noexcept:
  49. return array[ch >> 3] & (1 << (ch & 7))
  50. cdef inline void set_bit(uint8_t array[], uint64_t ch) noexcept:
  51. array[ch >> 3] |= (1 << (ch & 7))
  52. memset(ALLOWED_TABLE, 0, sizeof(ALLOWED_TABLE))
  53. memset(ALLOWED_NOTQS_TABLE, 0, sizeof(ALLOWED_NOTQS_TABLE))
  54. for i in range(128):
  55. if chr(i) in ALLOWED:
  56. set_bit(ALLOWED_TABLE, i)
  57. set_bit(ALLOWED_NOTQS_TABLE, i)
  58. if chr(i) in QS:
  59. set_bit(ALLOWED_NOTQS_TABLE, i)
  60. # ----------------- writer ---------------------------
# Growable output byte buffer used while quoting.
cdef struct Writer:
    # Current storage; initially the caller-supplied buffer (see _init_writer).
    char *buf
    # True once ``buf`` was obtained from PyMem_Malloc/PyMem_Realloc and
    # therefore has to be released with PyMem_Free.
    bint heap_allocated_buf
    # Capacity of ``buf`` in bytes.
    Py_ssize_t size
    # Number of bytes written so far (index of the next write).
    Py_ssize_t pos
    # OR of the ``changed`` flags passed to _write_char; tells the quoter
    # whether the output differs from the input.
    bint changed
  67. cdef inline void _init_writer(Writer* writer, char* buf):
  68. writer.buf = buf
  69. writer.heap_allocated_buf = False
  70. writer.size = BUF_SIZE
  71. writer.pos = 0
  72. writer.changed = 0
  73. cdef inline void _release_writer(Writer* writer):
  74. if writer.heap_allocated_buf:
  75. PyMem_Free(writer.buf)
  76. cdef inline int _write_char(Writer* writer, Py_UCS4 ch, bint changed):
  77. cdef char * buf
  78. cdef Py_ssize_t size
  79. if writer.pos == writer.size:
  80. # reallocate
  81. size = writer.size + BUF_SIZE
  82. if not writer.heap_allocated_buf:
  83. buf = <char*>PyMem_Malloc(size)
  84. if buf == NULL:
  85. PyErr_NoMemory()
  86. return -1
  87. memcpy(buf, writer.buf, writer.size)
  88. writer.heap_allocated_buf = True
  89. else:
  90. buf = <char*>PyMem_Realloc(writer.buf, size)
  91. if buf == NULL:
  92. PyErr_NoMemory()
  93. return -1
  94. writer.buf = buf
  95. writer.size = size
  96. writer.buf[writer.pos] = <char>ch
  97. writer.pos += 1
  98. writer.changed |= changed
  99. return 0
  100. cdef inline int _write_pct(Writer* writer, uint8_t ch, bint changed):
  101. if _write_char(writer, '%', changed) < 0:
  102. return -1
  103. if _write_char(writer, _to_hex(<uint8_t>ch >> 4), changed) < 0:
  104. return -1
  105. return _write_char(writer, _to_hex(<uint8_t>ch & 0x0f), changed)
  106. cdef inline int _write_utf8(Writer* writer, Py_UCS4 symbol):
  107. cdef uint64_t utf = <uint64_t> symbol
  108. if utf < 0x80:
  109. return _write_pct(writer, <uint8_t>utf, True)
  110. elif utf < 0x800:
  111. if _write_pct(writer, <uint8_t>(0xc0 | (utf >> 6)), True) < 0:
  112. return -1
  113. return _write_pct(writer, <uint8_t>(0x80 | (utf & 0x3f)), True)
  114. elif 0xD800 <= utf <= 0xDFFF:
  115. # surogate pair, ignored
  116. return 0
  117. elif utf < 0x10000:
  118. if _write_pct(writer, <uint8_t>(0xe0 | (utf >> 12)), True) < 0:
  119. return -1
  120. if _write_pct(writer, <uint8_t>(0x80 | ((utf >> 6) & 0x3f)),
  121. True) < 0:
  122. return -1
  123. return _write_pct(writer, <uint8_t>(0x80 | (utf & 0x3f)), True)
  124. elif utf > 0x10FFFF:
  125. # symbol is too large
  126. return 0
  127. else:
  128. if _write_pct(writer, <uint8_t>(0xf0 | (utf >> 18)), True) < 0:
  129. return -1
  130. if _write_pct(writer, <uint8_t>(0x80 | ((utf >> 12) & 0x3f)),
  131. True) < 0:
  132. return -1
  133. if _write_pct(writer, <uint8_t>(0x80 | ((utf >> 6) & 0x3f)),
  134. True) < 0:
  135. return -1
  136. return _write_pct(writer, <uint8_t>(0x80 | (utf & 0x3f)), True)
  137. # --------------------- end writer --------------------------
cdef class _Quoter:
    """Callable that percent-quotes a string.

    Keyword arguments:
      safe      -- extra ASCII characters that are copied verbatim.
      protected -- ASCII characters that are copied verbatim when they appear
                   literally, and whose existing percent-escapes are kept
                   escaped when requoting.
      qs        -- query-string mode: a space is written as '+' and the
                   characters in QS are not in the default safe set.
      requote   -- when true, existing "%XY" escapes in the input are decoded
                   and re-emitted instead of having their '%' escaped.

    Calling the instance with ``None`` returns ``None``; with a ``str`` (or a
    subclass instance) returns the quoted ``str``; anything else raises
    ``TypeError``.
    """
    cdef bint _qs
    cdef bint _requote
    # Bitmaps over ASCII (see bit_at / set_bit).
    cdef uint8_t _safe_table[16]
    cdef uint8_t _protected_table[16]

    def __init__(
        self, *, str safe='', str protected='', bint qs=False, bint requote=True,
    ):
        """Build the safe/protected bitmaps.

        Raises ValueError when ``safe`` or ``protected`` contains a
        non-ASCII character.
        """
        cdef Py_UCS4 ch
        self._qs = qs
        self._requote = requote
        # Start from the module-level default table matching the mode.
        if not self._qs:
            memcpy(self._safe_table,
                   ALLOWED_NOTQS_TABLE,
                   sizeof(self._safe_table))
        else:
            memcpy(self._safe_table,
                   ALLOWED_TABLE,
                   sizeof(self._safe_table))
        for ch in safe:
            if ord(ch) > 127:
                raise ValueError("Only safe symbols with ORD < 128 are allowed")
            set_bit(self._safe_table, ch)
        memset(self._protected_table, 0, sizeof(self._protected_table))
        for ch in protected:
            if ord(ch) > 127:
                raise ValueError("Only safe symbols with ORD < 128 are allowed")
            # Protected characters are also marked safe so that a literal
            # occurrence is copied through unchanged.
            set_bit(self._safe_table, ch)
            set_bit(self._protected_table, ch)

    def __call__(self, val):
        """Quote ``val``; ``None`` is passed through, non-str raises TypeError."""
        if val is None:
            return None
        if type(val) is not str:
            if isinstance(val, str):
                # derived from str
                val = str(val)
            else:
                raise TypeError("Argument should be str")
        return self._do_quote_or_skip(<str>val)

    cdef str _do_quote_or_skip(self, str val):
        # Fast path + buffer management around _do_quote.
        # The stack buffer is handed to the writer; the writer moves to the
        # heap on its own if the output outgrows it and is released in the
        # ``finally`` clause.
        cdef char[BUF_SIZE] buffer
        cdef Py_UCS4 ch
        cdef Py_ssize_t length = PyUnicode_GET_LENGTH(val)
        cdef Py_ssize_t idx = length
        cdef bint must_quote = 0
        cdef Writer writer
        cdef int kind = PyUnicode_KIND(val)
        cdef const void *data = PyUnicode_DATA(val)

        # If everything in the string is in the safe
        # table and all ASCII, we can skip quoting
        while idx:
            idx -= 1
            ch = PyUnicode_READ(kind, data, idx)
            if ch >= 128 or not bit_at(self._safe_table, ch):
                must_quote = 1
                break

        if not must_quote:
            return val

        _init_writer(&writer, &buffer[0])
        try:
            return self._do_quote(<str>val, length, kind, data, &writer)
        finally:
            _release_writer(&writer)

    cdef str _do_quote(
        self,
        str val,
        Py_ssize_t length,
        int kind,
        const void *data,
        Writer *writer
    ):
        # Walk ``val`` once, writing the quoted form into ``writer``.
        # Returns ``val`` itself when the writer recorded no change,
        # otherwise a new str decoded from the writer's ASCII bytes.
        cdef Py_UCS4 ch
        cdef long chl
        cdef int changed
        cdef Py_ssize_t idx = 0

        while idx < length:
            ch = PyUnicode_READ(kind, data, idx)
            idx += 1
            # A '%' followed by at least two more characters may be an
            # existing escape; only examined when requoting is enabled.
            if ch == '%' and self._requote and idx <= length - 2:
                chl = _restore_ch(
                    PyUnicode_READ(kind, data, idx),
                    PyUnicode_READ(kind, data, idx + 1)
                )
                if chl != -1:
                    # Valid escape: ``ch`` now holds the decoded byte and
                    # ``idx`` points just past the two hex digits.
                    ch = <Py_UCS4>chl
                    idx += 2
                    if ch < 128:
                        if bit_at(self._protected_table, ch):
                            # Protected: keep it escaped.
                            if _write_pct(writer, ch, True) < 0:
                                raise
                            continue

                        if bit_at(self._safe_table, ch):
                            # Safe: the escape is unnecessary, write the
                            # plain character.
                            if _write_char(writer, ch, True) < 0:
                                raise
                            continue

                    # Re-emit the escape with upper-case digits; this only
                    # counts as a change if the input used a lower-case digit.
                    changed = (_is_lower_hex(PyUnicode_READ(kind, data, idx - 2)) or
                               _is_lower_hex(PyUnicode_READ(kind, data, idx - 1)))
                    if _write_pct(writer, ch, changed) < 0:
                        raise
                    continue
                else:
                    # Not a valid escape: treat the '%' as an ordinary
                    # character.
                    ch = '%'

            if self._write(writer, ch) < 0:
                raise

        if not writer.changed:
            return val
        else:
            return PyUnicode_DecodeASCII(writer.buf, writer.pos, "strict")

    cdef inline int _write(self, Writer *writer, Py_UCS4 ch):
        # Write one input character: '+' for a space in query-string mode,
        # the character itself when it is ASCII and safe (not a change),
        # otherwise its percent-escaped UTF-8 encoding.
        if self._qs:
            if ch == ' ':
                return _write_char(writer, '+', True)

        if ch < 128 and bit_at(self._safe_table, ch):
            return _write_char(writer, ch, False)

        return _write_utf8(writer, ch)
  253. cdef class _Unquoter:
  254. cdef str _ignore
  255. cdef bint _has_ignore
  256. cdef str _unsafe
  257. cdef bytes _unsafe_bytes
  258. cdef Py_ssize_t _unsafe_bytes_len
  259. cdef const unsigned char * _unsafe_bytes_char
  260. cdef bint _qs
  261. cdef bint _plus # to match urllib.parse.unquote_plus
  262. cdef _Quoter _quoter
  263. cdef _Quoter _qs_quoter
  264. def __init__(self, *, ignore="", unsafe="", qs=False, plus=False):
  265. self._ignore = ignore
  266. self._has_ignore = bool(self._ignore)
  267. self._unsafe = unsafe
  268. # unsafe may only be extended ascii characters (0-255)
  269. self._unsafe_bytes = self._unsafe.encode('ascii')
  270. self._unsafe_bytes_len = len(self._unsafe_bytes)
  271. self._unsafe_bytes_char = self._unsafe_bytes
  272. self._qs = qs
  273. self._plus = plus
  274. self._quoter = _Quoter()
  275. self._qs_quoter = _Quoter(qs=True)
  276. def __call__(self, val):
  277. if val is None:
  278. return None
  279. if type(val) is not str:
  280. if isinstance(val, str):
  281. # derived from str
  282. val = str(val)
  283. else:
  284. raise TypeError("Argument should be str")
  285. return self._do_unquote(<str>val)
  286. cdef str _do_unquote(self, str val):
  287. cdef Py_ssize_t length = PyUnicode_GET_LENGTH(val)
  288. if length == 0:
  289. return val
  290. cdef list ret = []
  291. cdef char buffer[4]
  292. cdef Py_ssize_t buflen = 0
  293. cdef Py_ssize_t consumed
  294. cdef str unquoted
  295. cdef Py_UCS4 ch = 0
  296. cdef long chl = 0
  297. cdef Py_ssize_t idx = 0
  298. cdef Py_ssize_t start_pct
  299. cdef int kind = PyUnicode_KIND(val)
  300. cdef const void *data = PyUnicode_DATA(val)
  301. cdef bint changed = 0
  302. while idx < length:
  303. ch = PyUnicode_READ(kind, data, idx)
  304. idx += 1
  305. if ch == '%' and idx <= length - 2:
  306. changed = 1
  307. chl = _restore_ch(
  308. PyUnicode_READ(kind, data, idx),
  309. PyUnicode_READ(kind, data, idx + 1)
  310. )
  311. if chl != -1:
  312. ch = <Py_UCS4>chl
  313. idx += 2
  314. assert buflen < 4
  315. buffer[buflen] = ch
  316. buflen += 1
  317. try:
  318. unquoted = PyUnicode_DecodeUTF8Stateful(buffer, buflen,
  319. NULL, &consumed)
  320. except UnicodeDecodeError:
  321. start_pct = idx - buflen * 3
  322. buffer[0] = ch
  323. buflen = 1
  324. ret.append(val[start_pct : idx - 3])
  325. try:
  326. unquoted = PyUnicode_DecodeUTF8Stateful(buffer, buflen,
  327. NULL, &consumed)
  328. except UnicodeDecodeError:
  329. buflen = 0
  330. ret.append(val[idx - 3 : idx])
  331. continue
  332. if not unquoted:
  333. assert consumed == 0
  334. continue
  335. assert consumed == buflen
  336. buflen = 0
  337. if self._qs and unquoted in '+=&;':
  338. ret.append(self._qs_quoter(unquoted))
  339. elif (
  340. (self._unsafe_bytes_len and unquoted in self._unsafe) or
  341. (self._has_ignore and unquoted in self._ignore)
  342. ):
  343. ret.append(self._quoter(unquoted))
  344. else:
  345. ret.append(unquoted)
  346. continue
  347. else:
  348. ch = '%'
  349. if buflen:
  350. start_pct = idx - 1 - buflen * 3
  351. ret.append(val[start_pct : idx - 1])
  352. buflen = 0
  353. if ch == '+':
  354. if (
  355. (not self._qs and not self._plus) or
  356. (self._unsafe_bytes_len and self._is_char_unsafe(ch))
  357. ):
  358. ret.append('+')
  359. else:
  360. changed = 1
  361. ret.append(' ')
  362. continue
  363. if self._unsafe_bytes_len and self._is_char_unsafe(ch):
  364. changed = 1
  365. ret.append('%')
  366. h = hex(ord(ch)).upper()[2:]
  367. for ch in h:
  368. ret.append(ch)
  369. continue
  370. ret.append(ch)
  371. if not changed:
  372. return val
  373. if buflen:
  374. ret.append(val[length - buflen * 3 : length])
  375. return ''.join(ret)
  376. cdef inline bint _is_char_unsafe(self, Py_UCS4 ch):
  377. for i in range(self._unsafe_bytes_len):
  378. if ch == self._unsafe_bytes_char[i]:
  379. return True
  380. return False