| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761
777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071 |
- # Parsers for XML and HTML
- from lxml.includes cimport xmlparser
- from lxml.includes cimport htmlparser
cdef object _GenericAlias
try:
    from types import GenericAlias as _GenericAlias
except ImportError:
    # Fallback for Python 3.8, where "types.GenericAlias" cannot be imported.
    # The result is only needed as return value from "__class_getitem__".
    def _GenericAlias(cls, item):
        """Return a plain "Class[Item]" text stand-in for a generic alias."""
        cls_name = cls.__name__
        item_name = item.__name__
        return f"{cls_name}[{item_name}]"
class ParseError(LxmlSyntaxError):
    """Syntax error raised while parsing an XML document.

    Provided for compatibility with ElementTree 1.3 and later.

    The column is stored as ``column - 1`` in ``offset``; the ``position``
    property converts back and forth.
    """
    def __init__(self, message, code, line, column, filename=None):
        super(_ParseError, self).__init__(message)
        stored_offset = column - 1
        self.lineno = line
        self.offset = stored_offset
        self.code = code
        self.filename = filename

    @property
    def position(self):
        """The ``(lineno, offset + 1)`` pair of this error."""
        line = self.lineno
        return line, self.offset + 1

    @position.setter
    def position(self, new_pos):
        line, column = new_pos
        self.lineno = line
        self.offset = column - 1


# Module-level alias of ParseError; ParseError.__init__() passes it to super().
cdef object _ParseError = ParseError
class XMLSyntaxError(ParseError):
    """Syntax error raised while parsing an XML document.

    Subclass of ParseError that adds no behaviour of its own.
    """
cdef class ParserError(LxmlError):
    """Internal error of the lxml parser.

    Subclass of LxmlError that adds no behaviour of its own.
    """
@cython.final
@cython.internal
cdef class _ParserDictionaryContext:
    # Global parser context that shares the string dictionary.
    #
    # This class is a delegate singleton!
    #
    # One _ParserDictionaryContext object is created per thread to keep that
    # thread's state, but those per-thread objects must never be used
    # directly.  Always go through the static __GLOBAL_PARSER_CONTEXT that
    # is defined right below the class.
    #
    cdef tree.xmlDict* _c_dict
    cdef _BaseParser _default_parser
    cdef list _implied_parser_contexts

    def __cinit__(self):
        self._implied_parser_contexts = []

    def __dealloc__(self):
        if self._c_dict is not NULL:
            xmlparser.xmlDictFree(self._c_dict)

    cdef int initMainParserContext(self) except -1:
        """Store the global context in the thread dictionary of the main
        thread.  To be called once and only in the main thread."""
        c_state_dict = python.PyThreadState_GetDict()
        if c_state_dict is not NULL:
            (<dict>c_state_dict)["_ParserDictionaryContext"] = self
        return 0

    cdef _ParserDictionaryContext _findThreadParserContext(self):
        "Look up the current thread's _ParserDictionaryContext, creating it on first use."
        cdef _ParserDictionaryContext thread_context
        c_state_dict = python.PyThreadState_GetDict()
        if c_state_dict is NULL:
            # no thread dictionary available => this object serves as the context
            return self
        state_dict = <dict>c_state_dict
        c_found = python.PyDict_GetItem(state_dict, "_ParserDictionaryContext")
        if c_found is not NULL:
            return <object>c_found
        thread_context = <_ParserDictionaryContext>_ParserDictionaryContext.__new__(_ParserDictionaryContext)
        state_dict["_ParserDictionaryContext"] = thread_context
        return thread_context

    cdef int setDefaultParser(self, _BaseParser parser) except -1:
        "Install 'parser' as the default parser of the current thread."
        cdef _ParserDictionaryContext thread_context = self._findThreadParserContext()
        thread_context._default_parser = parser
        return 0

    cdef _BaseParser getDefaultParser(self):
        "Return the default parser of the current thread, creating it if needed."
        cdef _ParserDictionaryContext thread_context = self._findThreadParserContext()
        if thread_context._default_parser is not None:
            return thread_context._default_parser
        # Make sure the delegate itself has a default parser first ...
        if self._default_parser is None:
            self._default_parser = __DEFAULT_XML_PARSER._copy()
        # ... and give every other thread context its own copy of it.
        if thread_context is not self:
            thread_context._default_parser = self._default_parser._copy()
        return thread_context._default_parser

    cdef tree.xmlDict* _getThreadDict(self, tree.xmlDict* default):
        "Return the thread-local dict, setting it up from 'default' or from scratch if necessary."
        cdef _ParserDictionaryContext thread_context = self._findThreadParserContext()
        if thread_context._c_dict is not NULL:
            return thread_context._c_dict
        # thread dict not yet set up => use default or create a new one
        if default is not NULL:
            thread_context._c_dict = default
            xmlparser.xmlDictReference(default)
            return default
        if self._c_dict is NULL:
            self._c_dict = xmlparser.xmlDictCreate()
        if thread_context is not self:
            thread_context._c_dict = xmlparser.xmlDictCreateSub(self._c_dict)
        return thread_context._c_dict

    cdef int initThreadDictRef(self, tree.xmlDict** c_dict_ref) except -1:
        "Make '*c_dict_ref' point to the thread dict, freeing a different dict it held before."
        c_current = c_dict_ref[0]
        c_shared = self._getThreadDict(c_current)
        if c_current is c_shared:
            return 0
        if c_current is not NULL:
            xmlparser.xmlDictFree(c_current)
        c_dict_ref[0] = c_shared
        xmlparser.xmlDictReference(c_shared)
        return 0

    cdef int initParserDict(self, xmlparser.xmlParserCtxt* pctxt) except -1:
        "Assure we always use the same string dictionary."
        self.initThreadDictRef(&pctxt.dict)
        pctxt.dictNames = 1
        return 0

    cdef int initXPathParserDict(self, xpath.xmlXPathContext* pctxt) except -1:
        "Assure we always use the same string dictionary."
        self.initThreadDictRef(&pctxt.dict)
        return 0

    cdef int initDocDict(self, xmlDoc* result) except -1:
        "Store dict of last object parsed if no shared dict yet"
        # XXX The dict that the document already has is freed here as well.
        # This case should only occur for new documents with empty dicts,
        # otherwise we'd free data that's in use => segfault
        self.initThreadDictRef(&result.dict)
        return 0

    cdef _ParserContext findImpliedContext(self):
        """Return the innermost implied xml parser context of the current
        thread, or None if there is none.  This is used when the resolver
        functions are called with an xmlParserCtxt that was generated from
        within libxml2 (i.e. without a _ParserContext) - which happens when
        parsing schema and xinclude external references."""
        cdef _ParserContext implied_context
        cdef _ParserDictionaryContext thread_context = self._findThreadParserContext()
        if not thread_context._implied_parser_contexts:
            return None
        # the most recently pushed context is the current one
        implied_context = thread_context._implied_parser_contexts[-1]
        return implied_context

    cdef int pushImpliedContextFromParser(self, _BaseParser parser) except -1:
        "Push the context of 'parser' (or None if there is no parser) as implied context."
        cdef _ParserContext parser_context = None
        if parser is not None:
            parser_context = parser._getParserContext()
        self.pushImpliedContext(parser_context)
        return 0

    cdef int pushImpliedContext(self, _ParserContext parser_context) except -1:
        "Push a new implied context object."
        cdef _ParserDictionaryContext thread_context = self._findThreadParserContext()
        thread_context._implied_parser_contexts.append(parser_context)
        return 0

    cdef int popImpliedContext(self) except -1:
        "Pop the current implied context object."
        cdef _ParserDictionaryContext thread_context = self._findThreadParserContext()
        thread_context._implied_parser_contexts.pop()
        return 0


# The one global delegate, registered with the importing thread's state dict.
cdef _ParserDictionaryContext __GLOBAL_PARSER_CONTEXT = _ParserDictionaryContext()
__GLOBAL_PARSER_CONTEXT.initMainParserContext()
- ############################################################
- ## support for Python unicode I/O
- ############################################################
# name of Python Py_UNICODE encoding as known to libxml2
cdef const_char* _PY_UNICODE_ENCODING = NULL


cdef int _setupPythonUnicode() except -1:
    """Sets _PY_UNICODE_ENCODING to the internal encoding name of Python unicode
    strings if libxml2 supports reading native Python unicode.  This depends
    on iconv and the local Python installation, so we simply check whether a
    matching encoding handler can be found.
    """
    cdef tree.xmlCharEncodingHandler* c_handler
    cdef const_char* c_enc_name
    cdef Py_UNICODE *uchars = [c'<', c't', c'e', c's', c't', c'/', c'>']
    cdef const_xmlChar* buffer = <const_xmlChar*>uchars
    global _PY_UNICODE_ENCODING

    # apparently, libxml2 can't detect UTF-16 on some systems,
    # so look at the byte layout of the first two characters ourselves
    if (buffer[0] == c'<' and buffer[1] == c'\0' and
            buffer[2] == c't' and buffer[3] == c'\0'):
        c_enc_name = "UTF-16LE"
    elif (buffer[0] == c'\0' and buffer[1] == c'<' and
            buffer[2] == c'\0' and buffer[3] == c't'):
        c_enc_name = "UTF-16BE"
    else:
        # let libxml2 give it a try
        c_enc_name = _findEncodingName(buffer, sizeof(Py_UNICODE) * 7)
        if c_enc_name is NULL:
            # not my fault, it's YOUR broken system :)
            return 0

    c_handler = tree.xmlFindCharEncodingHandler(c_enc_name)
    if c_handler is NULL:
        return 0
    # The handler was only looked up to prove that the encoding is usable.
    tree.xmlCharEncCloseFunc(c_handler)
    _PY_UNICODE_ENCODING = c_enc_name
    return 0
cdef const_char* _findEncodingName(const_xmlChar* buffer, int size):
    "Work around bug in libxml2: find iconv name of encoding on our own."
    cdef tree.xmlCharEncoding detected = tree.xmlDetectCharEncoding(buffer, size)

    if detected == tree.XML_CHAR_ENCODING_NONE:
        return NULL
    if detected == tree.XML_CHAR_ENCODING_UTF16LE:
        # An "FF FE 00 00" prefix is reported as UTF-16LE by libxml2,
        # but according to the BOM it is really UTF-32LE.
        if size >= 4 and (buffer[0] == <const_xmlChar> b'\xFF' and
                          buffer[1] == <const_xmlChar> b'\xFE' and
                          buffer[2] == 0 and buffer[3] == 0):
            return "UTF-32LE"
        return "UTF-16LE"
    if detected == tree.XML_CHAR_ENCODING_UTF16BE:
        return "UTF-16BE"
    if detected == tree.XML_CHAR_ENCODING_UCS4LE:
        return "UCS-4LE"
    if detected == tree.XML_CHAR_ENCODING_UCS4BE:
        return "UCS-4BE"
    # returns a constant char*, no need to free it
    return tree.xmlGetCharEncodingName(detected)


# Python 3.12 removed support for "Py_UNICODE".
if python.PY_VERSION_HEX < 0x030C0000:
    _setupPythonUnicode()
cdef unicode _find_PyUCS4EncodingName():
    """
    Find a suitable encoding for Py_UCS4 PyUnicode strings in libxml2.

    Returns the encoding name with an explicit 'LE'/'BE' suffix,
    or None if libxml2 does not provide a name.
    """
    ustring = "<xml>\U0001F92A</xml>"
    cdef const xmlChar* buffer = <const xmlChar*> python.PyUnicode_DATA(ustring)
    cdef Py_ssize_t py_buffer_len = python.PyUnicode_GET_LENGTH(ustring)
    cdef tree.xmlCharEncoding c_detected = tree.xmlDetectCharEncoding(buffer, py_buffer_len)

    encoding_name = ''
    c_handler = tree.xmlGetCharEncodingHandler(c_detected)
    if c_handler is NULL:
        # no handler => fall back to the plain name of the detected encoding
        c_name = tree.xmlGetCharEncodingName(c_detected)
        if c_name:
            encoding_name = c_name.decode('UTF-8')
    else:
        try:
            if c_handler.name:
                encoding_name = c_handler.name.decode('UTF-8')
        finally:
            tree.xmlCharEncCloseFunc(c_handler)

    # Append the byte order of this platform if the name does not state one.
    if encoding_name and not encoding_name.endswith(('LE', 'BE')):
        encoding_name += 'BE' if python.PY_BIG_ENDIAN else 'LE'
    return encoding_name or None


_pyucs4_encoding_name = _find_PyUCS4EncodingName()
- ############################################################
- ## support for file-like objects
- ############################################################
@cython.final
@cython.internal
cdef class _FileReaderContext:
    # Feeds the data of a Python file-like object to libxml2 through the
    # "_readFilelikeParser" read callback.
    cdef object _filelike
    cdef object _encoding
    cdef object _url
    cdef object _bytes
    cdef _ExceptionContext _exc_context
    cdef Py_ssize_t _bytes_read
    cdef char* _c_url
    cdef bint _close_file_after_read

    def __cinit__(self, filelike, exc_context not None, url, encoding=None, bint close_file=False):
        self._filelike = filelike
        self._exc_context = exc_context
        self._encoding = encoding
        self._close_file_after_read = close_file
        if url is not None:
            url = _encodeFilename(url)
            self._c_url = _cstr(url)
        # "_c_url" points into this object, so keep a reference to it
        self._url = url
        self._bytes = b''
        self._bytes_read = 0

    cdef _close_file(self):
        # Close the file-like object (at most once) if closing was requested.
        if self._filelike is None or not self._close_file_after_read:
            return
        filelike = self._filelike
        self._filelike = None
        close = getattr(filelike, 'close', None)
        if close is not None:
            close()

    cdef xmlparser.xmlParserInputBuffer* _createParserInputBuffer(self) noexcept:
        # Allocate an input buffer that reads through this object.
        cdef xmlparser.xmlParserInputBuffer* c_input_buffer
        c_input_buffer = xmlparser.xmlAllocParserInputBuffer(0)
        if c_input_buffer:
            c_input_buffer.readcallback = _readFilelikeParser
            c_input_buffer.context = <python.PyObject*> self
        return c_input_buffer

    cdef xmlparser.xmlParserInput* _createParserInput(
            self, xmlparser.xmlParserCtxt* ctxt) noexcept:
        # Wrap a fresh input buffer in a parser input stream; NULL on failure.
        cdef xmlparser.xmlParserInputBuffer* c_input_buffer = self._createParserInputBuffer()
        if c_input_buffer:
            return xmlparser.xmlNewIOInputStream(ctxt, c_input_buffer, 0)
        return NULL

    cdef tree.xmlDtd* _readDtd(self) noexcept:
        # Parse a DTD from the file-like object; NULL if no buffer is available.
        cdef xmlparser.xmlParserInputBuffer* c_input_buffer = self._createParserInputBuffer()
        if c_input_buffer:
            with nogil:
                return xmlparser.xmlIOParseDTD(NULL, c_input_buffer, 0)
        return NULL

    cdef xmlDoc* _readDoc(self, xmlparser.xmlParserCtxt* ctxt, int options) noexcept:
        # Parse a complete XML or HTML document (depending on "ctxt.html")
        # from the file-like object and close the file afterwards.
        cdef xmlDoc* c_doc
        cdef void* c_callback_context = <python.PyObject*> self
        cdef char* c_encoding = _cstr(self._encoding) if self._encoding is not None else NULL
        cdef char* c_url = self._c_url

        saved_options = ctxt.options
        with nogil:
            if ctxt.html:
                c_doc = htmlparser.htmlCtxtReadIO(
                    ctxt, _readFilelikeParser, NULL, c_callback_context,
                    c_url, c_encoding, options)
                if c_doc is not NULL and _fixHtmlDictNames(ctxt.dict, c_doc) < 0:
                    tree.xmlFreeDoc(c_doc)
                    c_doc = NULL
            else:
                c_doc = xmlparser.xmlCtxtReadIO(
                    ctxt, _readFilelikeParser, NULL, c_callback_context,
                    c_url, c_encoding, options)
        ctxt.options = saved_options  # work around libxml2 problem

        try:
            self._close_file()
        except:
            self._exc_context._store_raised()
        finally:
            return c_doc  # swallow any exceptions

    cdef int copyToBuffer(self, char* c_buffer, int c_requested) noexcept:
        # Copy up to "c_requested" bytes into "c_buffer" and return the number
        # of bytes copied, or -1 if an exception occurred (which is stored in
        # the exception context).
        cdef int c_copied = 0
        cdef char* c_source
        cdef Py_ssize_t available
        if self._bytes_read < 0:
            # the end of the input was already reached
            return 0
        try:
            available = python.PyBytes_GET_SIZE(self._bytes) - self._bytes_read
            while c_requested > available:
                # hand out whatever is left of the current chunk ...
                c_source = _cstr(self._bytes) + self._bytes_read
                cstring_h.memcpy(c_buffer, c_source, available)
                c_copied += available
                c_buffer += available
                c_requested -= available

                # ... then fetch the next chunk from the file-like object
                self._bytes = self._filelike.read(c_requested)
                if not isinstance(self._bytes, bytes):
                    if not isinstance(self._bytes, unicode):
                        self._close_file()
                        raise TypeError, \
                            "reading from file-like objects must return byte strings or unicode strings"
                    if self._encoding is None:
                        self._bytes = (<unicode>self._bytes).encode('utf8')
                    else:
                        self._bytes = python.PyUnicode_AsEncodedString(
                            self._bytes, _cstr(self._encoding), NULL)

                available = python.PyBytes_GET_SIZE(self._bytes)
                if available == 0:
                    # empty read => mark the end of the input
                    self._bytes_read = -1
                    self._close_file()
                    return c_copied
                self._bytes_read = 0

            if c_requested > 0:
                c_source = _cstr(self._bytes) + self._bytes_read
                cstring_h.memcpy(c_buffer, c_source, c_requested)
                c_copied += c_requested
                self._bytes_read += c_requested
        except:
            c_copied = -1
            self._exc_context._store_raised()
            try:
                self._close_file()
            except:
                self._exc_context._store_raised()
        finally:
            return c_copied  # swallow any exceptions
cdef int _readFilelikeParser(void* ctxt, char* c_buffer, int c_size) noexcept with gil:
    # Read callback for libxml2: "ctxt" is the _FileReaderContext to read from.
    cdef _FileReaderContext reader = <_FileReaderContext>ctxt
    return reader.copyToBuffer(c_buffer, c_size)
- ############################################################
- ## support for custom document loaders
- ############################################################
cdef xmlparser.xmlParserInput* _local_resolver(const_char* c_url, const_char* c_pubid,
                                               xmlparser.xmlParserCtxt* c_context) noexcept with gil:
    # External entity loader that asks the registered Python resolvers first
    # and falls back to the original libxml2 loader.
    cdef _ResolverContext context
    cdef xmlparser.xmlParserInput* c_input
    cdef _InputDocument doc_ref
    cdef _FileReaderContext file_context

    # Prefer the _ParserContext that is associated with the xmlParserCtxt.
    # If there is none, check whether the thread state object has an
    # implied context.
    if c_context._private is NULL:
        context = __GLOBAL_PARSER_CONTEXT.findImpliedContext()
    else:
        context = <_ResolverContext>c_context._private

    if context is None:
        if __DEFAULT_ENTITY_LOADER is NULL:
            return NULL
        with nogil:
            # free the GIL as we might do serious I/O here (e.g. HTTP)
            c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
        return c_input

    try:
        # parsing a related document (DTD etc.) => UTF-8 encoded URL?
        url = None if c_url is NULL else _decodeFilename(<const_xmlChar*>c_url)
        # the public ID is always UTF-8
        pubid = None if c_pubid is NULL else funicode(<const_xmlChar*>c_pubid)
        doc_ref = context._resolvers.resolve(url, pubid, context)
    except:
        context._store_raised()
        return NULL

    if doc_ref is not None:
        if doc_ref._type == PARSER_DATA_STRING:
            data = doc_ref._data_bytes
            keep_alive = data
            filename = doc_ref._filename
            if not filename:
                filename = None
            elif not isinstance(filename, bytes):
                # most likely a text URL
                filename = filename.encode('utf8')
                if not isinstance(filename, bytes):
                    filename = None

            if tree.LIBXML_VERSION >= 21400:
                c_filename = <char *>tree.xmlStrdup(_xcstr(filename)) if filename is not None else NULL
                c_input = xmlparser.xmlNewInputFromMemory(
                    c_filename, _xcstr(data), <size_t> python.PyBytes_GET_SIZE(data), 0)
            else:
                c_input = xmlparser.xmlNewInputStream(c_context)
                if c_input is not NULL:
                    if filename is not None:
                        c_input.filename = <char *>tree.xmlStrdup(_xcstr(filename))
                    # let the input stream read directly from the bytes object
                    c_input.base = _xcstr(data)
                    c_input.length = python.PyBytes_GET_SIZE(data)
                    c_input.cur = c_input.base
                    c_input.end = c_input.base + c_input.length
        elif doc_ref._type == PARSER_DATA_FILENAME:
            keep_alive = None
            c_filename = _cstr(doc_ref._filename)
            with nogil:
                # free the GIL as we might do serious I/O here
                c_input = xmlparser.xmlNewInputFromFile(
                    c_context, c_filename)
        elif doc_ref._type == PARSER_DATA_FILE:
            file_context = _FileReaderContext(doc_ref._file, context, url,
                                              None, doc_ref._close_file)
            c_input = file_context._createParserInput(c_context)
            keep_alive = file_context
        else:
            keep_alive = None
            c_input = NULL

        # Store the Python object that backs the input in the context.
        if keep_alive is not None:
            context._storage.add(keep_alive)
        if c_input is not NULL:
            return c_input

    # The resolvers did not provide an input => use the original loader.
    if __DEFAULT_ENTITY_LOADER is NULL:
        return NULL

    with nogil:
        # free the GIL as we might do serious I/O here (e.g. HTTP)
        c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
    return c_input
# The entity loader that libxml2 had installed when this module was set up.
cdef xmlparser.xmlExternalEntityLoader __DEFAULT_ENTITY_LOADER
__DEFAULT_ENTITY_LOADER = xmlparser.xmlGetExternalEntityLoader()


cdef xmlparser.xmlExternalEntityLoader _register_document_loader() noexcept nogil:
    # Install "_local_resolver" as external entity loader and return the
    # loader that was active before.
    cdef xmlparser.xmlExternalEntityLoader previous_loader
    previous_loader = xmlparser.xmlGetExternalEntityLoader()
    xmlparser.xmlSetExternalEntityLoader(<xmlparser.xmlExternalEntityLoader>_local_resolver)
    return previous_loader
cdef void _reset_document_loader(xmlparser.xmlExternalEntityLoader old) noexcept nogil:
    # Re-install "old" (as returned by _register_document_loader()) as the
    # external entity loader.
    xmlparser.xmlSetExternalEntityLoader(old)
- ############################################################
- ## Parsers
- ############################################################
@cython.no_gc_clear  # May have to call "self._validator.disconnect()" on dealloc.
@cython.internal
cdef class _ParserContext(_ResolverContext):
    # Ties a libxml2 xmlParserCtxt to the lxml-level state of one parser:
    # the error log, an optional schema validator, the lock that is held
    # between prepare() and cleanup(), and an optional result document.
    cdef _ErrorLog _error_log
    cdef _ParserSchemaValidationContext _validator
    cdef xmlparser.xmlParserCtxt* _c_ctxt
    cdef xmlparser.xmlExternalEntityLoader _orig_loader
    cdef python.PyThread_type_lock _lock
    cdef _Document _doc
    cdef bint _collect_ids

    def __cinit__(self):
        self._collect_ids = True
        if config.ENABLE_THREADING:
            self._lock = python.PyThread_allocate_lock()
        self._error_log = _ErrorLog()

    def __dealloc__(self):
        if config.ENABLE_THREADING and self._lock is not NULL:
            python.PyThread_free_lock(self._lock)
            self._lock = NULL
        if self._c_ctxt is not NULL:
            if <void*>self._validator is not NULL and self._validator is not None:
                # If the parser was not closed correctly (e.g. interrupted iterparse()),
                # and the schema validator wasn't freed and cleaned up yet, the libxml2 SAX
                # validator plug might still be in place, which will make xmlFreeParserCtxt()
                # crash when trying to xmlFree() a static SAX handler.
                # Thus, make sure we disconnect the handler interceptor here at the latest.
                self._validator.disconnect()
            xmlparser.xmlFreeParserCtxt(self._c_ctxt)

    cdef _ParserContext _copy(self):
        """Return a new context of the same class.

        The copy receives the same ID collection setting, a copy of the
        validator (if there is one) and a copy of the resolver registry.
        No libxml2 parser context is attached to it (NULL is passed).
        """
        cdef _ParserContext context
        context = self.__class__()
        context._collect_ids = self._collect_ids
        # "_validator" is optional (see prepare()/cleanup()), so only copy it
        # when it is set.  The new context's "_validator" is None otherwise.
        if self._validator is not None:
            context._validator = self._validator.copy()
        _initParserContext(context, self._resolvers._copy(), NULL)
        return context

    cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt) noexcept:
        """
        Connects the libxml2-level context to the lxml-level parser context.
        """
        self._c_ctxt = c_ctxt
        # back-reference that lets C callbacks find this object again
        c_ctxt._private = <void*>self

    cdef void _resetParserContext(self) noexcept:
        """Reset the libxml2 parser context (if any) for the next parser run."""
        if self._c_ctxt is not NULL:
            if self._c_ctxt.html:
                htmlparser.htmlCtxtReset(self._c_ctxt)
                self._c_ctxt.disableSAX = 0  # work around bug in libxml2
            else:
                xmlparser.xmlClearParserCtxt(self._c_ctxt)
                # work around bug in libxml2 [2.9.10 .. 2.9.14]:
                # https://gitlab.gnome.org/GNOME/libxml2/-/issues/378
                self._c_ctxt.nsNr = 0

    cdef int prepare(self, bint set_document_loader=True) except -1:
        """Acquire the parser lock and set the context up for a parser run.

        Clears the error log and the stored document, hooks up the error
        receiver, optionally installs the document loader and connects the
        validator (if any).  Raises ParserError if locking fails.
        """
        cdef int result
        if config.ENABLE_THREADING and self._lock is not NULL:
            with nogil:
                result = python.PyThread_acquire_lock(
                    self._lock, python.WAIT_LOCK)
            if result == 0:
                raise ParserError, "parser locking failed"
        self._error_log.clear()
        self._doc = None
        # Connect the lxml error log with libxml2's error handling.  In the case of parsing
        # HTML, ctxt->sax is not set to null, so this always works.  The libxml2 function
        # that does this is htmlInitParserCtxt in HTMLparser.c.  For HTML (and possibly XML
        # too), libxml2's SAX's serror is set to be the place where errors are sent when
        # schannel is set to ctxt->sax->serror in xmlCtxtErrMemory in libxml2's
        # parserInternals.c.
        # Need a cast here because older libxml2 releases do not use 'const' in the functype.
        self._c_ctxt.sax.serror = <xmlerror.xmlStructuredErrorFunc> _receiveParserError
        self._orig_loader = _register_document_loader() if set_document_loader else NULL
        if self._validator is not None:
            self._validator.connect(self._c_ctxt, self._error_log)
        return 0

    cdef int cleanup(self) except -1:
        """Undo prepare(): restore the document loader, disconnect the
        validator, reset the parser context and release the parser lock."""
        if self._orig_loader is not NULL:
            _reset_document_loader(self._orig_loader)
        try:
            if self._validator is not None:
                self._validator.disconnect()
            self._resetParserContext()
            self.clear()
            self._doc = None
            self._c_ctxt.sax.serror = NULL
        finally:
            # always release the lock, even if the cleanup steps failed
            if config.ENABLE_THREADING and self._lock is not NULL:
                python.PyThread_release_lock(self._lock)
        return 0

    cdef object _handleParseResult(self, _BaseParser parser,
                                   xmlDoc* result, filename):
        """Check the parse result and return it as a document object.

        Returns the stored document if it wraps the resulting xmlDoc,
        otherwise a new document created by _documentFactory().
        """
        c_doc = self._handleParseResultDoc(parser, result, filename)
        if self._doc is not None and self._doc._c_doc is c_doc:
            return self._doc
        else:
            return _documentFactory(c_doc, parser)

    cdef xmlDoc* _handleParseResultDoc(self, _BaseParser parser,
                                       xmlDoc* result, filename) except NULL:
        """Check the parse result via the module-level _handleParseResult().

        The recover flag is taken from the parser options.  The result
        document may only be freed on errors if no document is stored in
        this context.
        """
        recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER
        return _handleParseResult(self, self._c_ctxt, result,
                                  filename, recover,
                                  free_doc=self._doc is None)
cdef _initParserContext(_ParserContext context,
                        _ResolverRegistry resolvers,
                        xmlparser.xmlParserCtxt* c_ctxt):
    # Set up the resolver part of the context and, if a libxml2 parser
    # context is given, connect it as well.
    _initResolverContext(context, resolvers)
    if c_ctxt is NULL:
        return
    context._initParserContext(c_ctxt)
cdef void _forwardParserError(xmlparser.xmlParserCtxt* _parser_context, const xmlerror.xmlError* error) noexcept with gil:
    """
    Hand an error reported by libxml2 to the lxml-level error_log of the
    _ParserContext that is stored in the "_private" field.
    """
    cdef _ParserContext context = <_ParserContext>_parser_context._private
    context._error_log._receive(error)
cdef void _receiveParserError(void* c_context, const xmlerror.xmlError* error) noexcept nogil:
    # Structured error callback for libxml2.  Does nothing unless __DEBUG is set.
    cdef xmlparser.xmlParserCtxt* c_parser_ctxt = <xmlparser.xmlParserCtxt*>c_context
    if not __DEBUG:
        return
    if c_parser_ctxt is NULL or c_parser_ctxt._private is NULL:
        # no lxml-level parser context to report to
        _forwardError(NULL, error)
    else:
        _forwardParserError(c_parser_ctxt, error)
cdef int _raiseParseError(xmlparser.xmlParserCtxt* ctxt, filename,
                          _ErrorLog error_log) except -1:
    """Raise the exception that describes the last error of ``ctxt``.

    Always raises, one of:

    * IOError if a filename is given and the last error is an I/O error
    * the exception built by ``error_log`` if the log is non-empty
    * XMLSyntaxError built from the last error of ``ctxt`` otherwise

    ``filename`` may be None, bytes or text.  ``error_log`` may be None.
    """
    # Decode the libxml2 message once, so that every branch below works with
    # text.  Calling .strip() on the raw C string would yield a bytes object
    # whose repr ("b'...'") ends up in the formatted exception message.
    message = None
    if ctxt.lastError.message is not NULL:
        try:
            message = ctxt.lastError.message.decode('utf-8')
        except UnicodeDecodeError:
            # the filename may be in there => play it safe
            message = ctxt.lastError.message.decode('iso8859-1')
        message = message.strip()

    if filename is not None and \
            ctxt.lastError.domain == xmlerror.XML_FROM_IO:
        if isinstance(filename, bytes):
            filename = _decodeFilenameWithLength(
                <bytes>filename, len(<bytes>filename))
        if message is not None:
            message = f"Error reading file '{filename}': {message}"
        else:
            message = f"Error reading '{filename}'"
        raise IOError, message
    elif error_log:
        raise error_log._buildParseException(
            XMLSyntaxError, "Document is not well formed")
    elif message is not None:
        code = ctxt.lastError.code
        line = ctxt.lastError.line
        column = ctxt.lastError.int2
        if line > 0:
            message = f"line {line}: {message}"
        raise XMLSyntaxError(message, code, line, column, filename)
    else:
        raise XMLSyntaxError(None, xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0,
                             filename)
cdef xmlDoc* _handleParseResult(_ParserContext context,
                                xmlparser.xmlParserCtxt* c_ctxt,
                                xmlDoc* result, filename,
                                bint recover, bint free_doc) except NULL:
    """Decide whether the parse result can be presented to the user.

    ``result`` is passed in as NULL if the parser was not able to parse the
    document.  Returns the (non-NULL) result document on success.  Otherwise,
    re-raises an exception stored in ``context`` or raises via
    _raiseParseError().  A rejected document is only freed if ``free_doc``
    is set.  ``context`` may be None, in which case no validator, error log
    or stored exception is consulted.
    """
    cdef bint well_formed
    if result is not NULL:
        __GLOBAL_PARSER_CONTEXT.initDocDict(result)

    # Detach the document from the parser context, freeing it if it is not
    # the result document.
    if c_ctxt.myDoc is not NULL:
        if c_ctxt.myDoc is not result:
            __GLOBAL_PARSER_CONTEXT.initDocDict(c_ctxt.myDoc)
            tree.xmlFreeDoc(c_ctxt.myDoc)
        c_ctxt.myDoc = NULL

    if result is not NULL:
        # "wellFormed" in libxml2 is 0 if the parser found fatal errors.  It still returns a
        # parse result document if 'recover=True'.  Here, we determine if we can present
        # the document to the user or consider it incorrect or broken enough to raise an error.
        if (context is not None and context._validator is not None and
                not context._validator.isvalid()):
            well_formed = 0  # actually not 'valid', but anyway ...
        elif (not c_ctxt.wellFormed and not c_ctxt.html and
                c_ctxt.charset == tree.XML_CHAR_ENCODING_8859_1 and
                context is not None and
                [1 for error in context._error_log
                 if error.type == ErrorTypes.ERR_INVALID_CHAR]):
            # An encoding error occurred and libxml2 switched from UTF-8
            # input to (undecoded) Latin-1, at some arbitrary point in the
            # document.  Better raise an error than allowing for a broken
            # tree with mixed encodings.  This is fixed in libxml2 2.12.
            well_formed = 0
        elif recover or (c_ctxt.wellFormed and
                         c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR):
            well_formed = 1
        elif not c_ctxt.replaceEntities and not c_ctxt.validate \
                and context is not None:
            # in this mode, we ignore errors about undefined entities
            for error in context._error_log.filter_from_errors():
                if error.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
                        error.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
                    well_formed = 0
                    break
            else:
                well_formed = 1
        else:
            well_formed = 0

        if not well_formed:
            if free_doc:
                tree.xmlFreeDoc(result)
            result = NULL

    # An exception that was stored during parsing takes precedence.
    if context is not None and context._has_raised():
        if result is not NULL:
            if free_doc:
                tree.xmlFreeDoc(result)
            result = NULL
        context._raise_if_stored()

    if result is NULL:
        # _raiseParseError() raises in every one of its branches
        if context is not None:
            _raiseParseError(c_ctxt, filename, context._error_log)
        else:
            _raiseParseError(c_ctxt, filename, None)
    else:
        if result.URL is NULL and filename is not None:
            result.URL = tree.xmlStrdup(_xcstr(filename))
        if result.encoding is NULL:
            result.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")

    if context is not None and context._validator is not None and \
            context._validator._add_default_attributes:
        # we currently need to do this here as libxml2 does not
        # support inserting default attributes during parse-time
        # validation
        context._validator.inject_default_attributes(result)

    return result
cdef int _fixHtmlDictNames(tree.xmlDict* c_dict, xmlDoc* c_doc) noexcept nogil:
    """
    Move the names of all elements in c_doc (and of their attributes) to
    c_dict, walking the whole document.

    Returns 0 on success (also for a NULL document) and -1 as soon as
    fixing up a node fails.
    """
    cdef xmlNode* c_current
    if not c_doc:
        return 0
    c_current = c_doc.children
    tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_current, 1)
    if (c_current.type == tree.XML_ELEMENT_NODE and
            _fixHtmlDictNodeNames(c_dict, c_current) < 0):
        return -1
    tree.END_FOR_EACH_ELEMENT_FROM(c_current)
    return 0
cdef int _fixHtmlDictSubtreeNames(tree.xmlDict* c_dict, xmlDoc* c_doc,
                                  xmlNode* c_start_node) noexcept nogil:
    """
    Move names to the dict, iterating in document order, starting at
    c_start_node.  This is used in incremental parsing after each chunk.

    Without a start node, the whole document is processed.  Returns 0 on
    success (also for a NULL document) and -1 on failure.
    """
    cdef xmlNode* c_current
    if c_doc is NULL:
        return 0
    if c_start_node is NULL:
        # no known restart position => process everything
        return _fixHtmlDictNames(c_dict, c_doc)
    c_current = c_start_node
    tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_current, 1)
    if (c_current.type == tree.XML_ELEMENT_NODE and
            _fixHtmlDictNodeNames(c_dict, c_current) < 0):
        return -1
    tree.END_FOR_EACH_ELEMENT_FROM(c_current)
    return 0
cdef inline int _fixHtmlDictNodeNames(tree.xmlDict* c_dict,
                                      xmlNode* c_node) noexcept nogil:
    """
    Replace the name of c_node and the names of its attributes by the
    corresponding entries of c_dict, freeing any name string that is not
    already the dict's own entry.

    Returns 0 on success and -1 if a dict lookup returns NULL.
    """
    cdef xmlNode* c_attribute

    # the element name itself
    c_dict_name = tree.xmlDictLookup(c_dict, c_node.name, -1)
    if c_dict_name is NULL:
        return -1
    if c_node.name is not c_dict_name:
        tree.xmlFree(<char*>c_node.name)
        c_node.name = c_dict_name

    # then the names of all attributes
    c_attribute = <xmlNode*>c_node.properties
    while c_attribute is not NULL:
        c_dict_name = tree.xmlDictLookup(c_dict, c_attribute.name, -1)
        if c_dict_name is NULL:
            return -1
        if c_attribute.name is not c_dict_name:
            tree.xmlFree(<char*>c_attribute.name)
            c_attribute.name = c_dict_name
        c_attribute = c_attribute.next
    return 0
@cython.internal
cdef class _BaseParser:
    """Common base class of the XML and HTML parsers.

    Holds the parser configuration, lazily creates the parser contexts for
    normal and push parsing, and implements the low-level parse entry
    points for unicode strings, byte buffers, files and file-like objects.
    """
    cdef ElementClassLookup _class_lookup
    cdef _ResolverRegistry _resolvers
    # context used by the one-shot parse methods below
    cdef _ParserContext _parser_context
    # separate context used by the push (feed) parser interface
    cdef _ParserContext _push_parser_context
    # libxml2 parser option bit flags
    cdef int _parse_options
    cdef bint _for_html
    cdef bint _remove_comments
    cdef bint _remove_pis
    cdef bint _strip_cdata
    cdef bint _collect_ids
    cdef bint _resolve_external_entities
    cdef XMLSchema _schema
    cdef bytes _filename
    cdef readonly object target
    # UTF-8 encoded encoding name (bytes) or None
    cdef object _default_encoding
    cdef tuple _events_to_collect  # (event_types, tag)

    def __init__(self, int parse_options, bint for_html, XMLSchema schema,
                 remove_comments, remove_pis, strip_cdata, collect_ids,
                 target, encoding, bint resolve_external_entities=True):
        """Store the parser configuration.

        Raises TypeError when called on anything but an XMLParser or
        HTMLParser (sub)class instance, and LookupError if libxml2 does not
        know the requested ``encoding``.
        """
        cdef tree.xmlCharEncodingHandler* enchandler
        if not isinstance(self, (XMLParser, HTMLParser)):
            raise TypeError, "This class cannot be instantiated"

        if not collect_ids and tree.LIBXML_VERSION >= 21500:
            parse_options |= xmlparser.XML_PARSE_SKIP_IDS

        self._parse_options = parse_options
        self.target = target
        self._for_html = for_html
        self._remove_comments = remove_comments
        self._remove_pis = remove_pis
        self._strip_cdata = strip_cdata
        self._collect_ids = collect_ids
        self._resolve_external_entities = resolve_external_entities
        self._schema = schema

        self._resolvers = _ResolverRegistry()

        if encoding is None:
            self._default_encoding = None
        else:
            encoding_utf = _utf8(encoding)
            enchandler = tree.xmlFindCharEncodingHandler(_cstr(encoding_utf))
            if enchandler is NULL:
                # Format the decoded name; formatting the bytes object
                # itself would produce "unknown encoding: 'b'...''".
                encoding_name = (<bytes>encoding_utf).decode('utf-8', 'replace')
                raise LookupError, f"unknown encoding: '{encoding_name}'"
            # the handler was only looked up to validate the name
            tree.xmlCharEncCloseFunc(enchandler)
            self._default_encoding = encoding_utf

    cdef _setBaseURL(self, base_url):
        """Remember the (encoded) base URL as the parser's filename."""
        self._filename = _encodeFilename(base_url)

    cdef _collectEvents(self, event_types, tag):
        """Configure the events (and tag filter) that a push parser context
        created later will collect.  None means no events.
        """
        if event_types is None:
            event_types = ()
        else:
            event_types = tuple(set(event_types))
            _buildParseEventFilter(event_types)  # purely for validation
        self._events_to_collect = (event_types, tag)

    cdef _ParserContext _getParserContext(self):
        """Return the context for one-shot parsing, creating and
        configuring it on first use.
        """
        cdef xmlparser.xmlParserCtxt* pctxt
        if self._parser_context is None:
            self._parser_context = self._createContext(self.target, None)
            self._parser_context._collect_ids = self._collect_ids
            if self._schema is not None:
                self._parser_context._validator = \
                    self._schema._newSaxValidator(
                        self._parse_options & xmlparser.XML_PARSE_DTDATTR)
            pctxt = self._newParserCtxt()
            _initParserContext(self._parser_context, self._resolvers, pctxt)
            self._configureSaxContext(pctxt)
        return self._parser_context

    cdef _ParserContext _getPushParserContext(self):
        """Return the context for push (feed) parsing, creating and
        configuring it on first use.
        """
        cdef xmlparser.xmlParserCtxt* pctxt
        if self._push_parser_context is None:
            self._push_parser_context = self._createContext(
                self.target, self._events_to_collect)
            self._push_parser_context._collect_ids = self._collect_ids
            if self._schema is not None:
                self._push_parser_context._validator = \
                    self._schema._newSaxValidator(
                        self._parse_options & xmlparser.XML_PARSE_DTDATTR)
            pctxt = self._newPushParserCtxt()
            _initParserContext(
                self._push_parser_context, self._resolvers, pctxt)
            self._configureSaxContext(pctxt)
        return self._push_parser_context

    cdef _ParserContext _createContext(self, target, events_to_collect):
        """
        This method creates and configures the lxml-level parser.

        A target selects a _TargetParserContext, collected events select a
        _SaxParserContext, otherwise a plain _ParserContext is returned.
        """
        cdef _SaxParserContext sax_context
        if target is not None:
            sax_context = _TargetParserContext(self)
            (<_TargetParserContext>sax_context)._setTarget(target)
        elif events_to_collect:
            sax_context = _SaxParserContext(self)
        else:
            # nothing special to configure
            return _ParserContext()
        if events_to_collect:
            events, tag = events_to_collect
            sax_context._setEventFilter(events, tag)
        return sax_context

    @cython.final
    cdef int _configureSaxContext(self, xmlparser.xmlParserCtxt* pctxt) except -1:
        """Switch off or replace SAX callbacks according to the parser
        configuration.
        """
        if self._remove_comments:
            pctxt.sax.comment = NULL
        if self._remove_pis:
            pctxt.sax.processingInstruction = NULL
        if self._strip_cdata:
            # hard switch-off for CDATA nodes => makes them plain text
            pctxt.sax.cdataBlock = NULL
        if not self._resolve_external_entities:
            pctxt.sax.getEntity = _getInternalEntityOnly

    cdef int _registerHtmlErrorHandler(self, xmlparser.xmlParserCtxt* c_ctxt) except -1:
        """Upgrade a SAX1 handler of an HTML parser context to SAX2 so that
        structured errors get reported.

        Raises MemoryError if a private copy of the default handler cannot
        be allocated.
        """
        cdef xmlparser.xmlSAXHandler* sax = c_ctxt.sax
        if sax is not NULL and sax.initialized and sax.initialized != xmlparser.XML_SAX2_MAGIC:
            # need to extend SAX1 context to SAX2 to get proper error reports
            if <xmlparser.xmlSAXHandlerV1*>sax is &htmlparser.htmlDefaultSAXHandler:
                # never modify the shared default handler => work on a copy
                sax = <xmlparser.xmlSAXHandler*> tree.xmlMalloc(sizeof(xmlparser.xmlSAXHandler))
                if sax is NULL:
                    raise MemoryError()
                cstring_h.memcpy(sax, &htmlparser.htmlDefaultSAXHandler,
                                 sizeof(htmlparser.htmlDefaultSAXHandler))
                c_ctxt.sax = sax
            sax.initialized = xmlparser.XML_SAX2_MAGIC
            # Need a cast here because older libxml2 releases do not use 'const' in the functype.
            sax.serror = <xmlerror.xmlStructuredErrorFunc> _receiveParserError
            sax.startElementNs = NULL
            sax.endElementNs = NULL
            sax._private = NULL
        return 0

    cdef xmlparser.xmlParserCtxt* _newParserCtxt(self) except NULL:
        """
        Create and initialise a libxml2-level parser context.

        Raises MemoryError if libxml2 fails to create the context.
        """
        cdef xmlparser.xmlParserCtxt* c_ctxt
        if self._for_html:
            c_ctxt = htmlparser.htmlCreateMemoryParserCtxt('dummy', 5)
            if c_ctxt is not NULL:
                self._registerHtmlErrorHandler(c_ctxt)
        else:
            c_ctxt = xmlparser.xmlNewParserCtxt()
        if c_ctxt is NULL:
            raise MemoryError
        c_ctxt.sax.startDocument = _initSaxDocument
        return c_ctxt

    cdef xmlparser.xmlParserCtxt* _newPushParserCtxt(self) except NULL:
        """
        Create and initialise a libxml2-level push parser context that
        already has the parser options applied.

        Raises MemoryError if libxml2 fails to create the context.
        """
        cdef xmlparser.xmlParserCtxt* c_ctxt
        cdef char* c_filename = _cstr(self._filename) if self._filename is not None else NULL
        if self._for_html:
            c_ctxt = htmlparser.htmlCreatePushParserCtxt(
                NULL, NULL, NULL, 0, c_filename, tree.XML_CHAR_ENCODING_NONE)
            if c_ctxt is not NULL:
                self._registerHtmlErrorHandler(c_ctxt)
                htmlparser.htmlCtxtUseOptions(c_ctxt, self._parse_options)
        else:
            c_ctxt = xmlparser.xmlCreatePushParserCtxt(
                NULL, NULL, NULL, 0, c_filename)
            if c_ctxt is not NULL:
                xmlparser.xmlCtxtUseOptions(c_ctxt, self._parse_options)
        if c_ctxt is NULL:
            raise MemoryError()
        c_ctxt.sax.startDocument = _initSaxDocument
        return c_ctxt

    @property
    def error_log(self):
        """The error log of the last parser run.
        """
        cdef _ParserContext context
        context = self._getParserContext()
        return context._error_log.copy()

    @property
    def resolvers(self):
        """The custom resolver registry of this parser."""
        return self._resolvers

    @property
    def version(self):
        """The version of the underlying XML parser."""
        return "libxml2 %d.%d.%d" % LIBXML_VERSION

    def set_element_class_lookup(self, ElementClassLookup lookup = None):
        """set_element_class_lookup(self, lookup = None)

        Set a lookup scheme for element classes generated from this parser.

        Reset it by passing None or nothing.
        """
        self._class_lookup = lookup

    cdef _BaseParser _copy(self):
        "Create a new parser with the same configuration."
        cdef _BaseParser parser
        parser = self.__class__()
        parser._parse_options = self._parse_options
        parser._for_html = self._for_html
        parser._remove_comments = self._remove_comments
        parser._remove_pis = self._remove_pis
        parser._strip_cdata = self._strip_cdata
        # These two are also set from constructor arguments and must be
        # copied, or the new parser silently falls back to the defaults.
        parser._collect_ids = self._collect_ids
        parser._resolve_external_entities = self._resolve_external_entities
        parser._filename = self._filename
        parser._resolvers = self._resolvers
        parser.target = self.target
        parser._class_lookup = self._class_lookup
        parser._default_encoding = self._default_encoding
        parser._schema = self._schema
        parser._events_to_collect = self._events_to_collect
        return parser

    def copy(self):
        """copy(self)

        Create a new parser with the same configuration.
        """
        return self._copy()

    def makeelement(self, _tag, attrib=None, nsmap=None, **_extra):
        """makeelement(self, _tag, attrib=None, nsmap=None, **_extra)

        Creates a new element associated with this parser.
        """
        return _makeElement(_tag, NULL, None, self, None, None,
                            attrib, nsmap, _extra)

    # internal parser methods

    cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL:
        """Parse unicode document, share dictionary if possible.

        The string's internal buffer is passed to libxml2 directly, using
        an encoding name that matches the buffer's character width.
        """
        cdef _ParserContext context
        cdef xmlDoc* result
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef Py_ssize_t py_buffer_len
        cdef int buffer_len, c_kind
        cdef const_char* c_text
        cdef const_char* c_encoding = _PY_UNICODE_ENCODING

        if python.PyUnicode_IS_READY(utext):
            # PEP-393 string
            c_text = <const_char*>python.PyUnicode_DATA(utext)
            py_buffer_len = python.PyUnicode_GET_LENGTH(utext)
            c_kind = python.PyUnicode_KIND(utext)
            if c_kind == 1:
                if python.PyUnicode_MAX_CHAR_VALUE(utext) <= 127:
                    c_encoding = 'UTF-8'
                else:
                    c_encoding = 'ISO-8859-1'
            elif c_kind == 2:
                py_buffer_len *= 2
                if python.PY_BIG_ENDIAN:
                    c_encoding = 'UTF-16BE'  # actually UCS-2
                else:
                    c_encoding = 'UTF-16LE'  # actually UCS-2
            elif c_kind == 4:
                py_buffer_len *= 4
                if python.PY_BIG_ENDIAN:
                    c_encoding = 'UTF-32BE'  # actually UCS-4
                else:
                    c_encoding = 'UTF-32LE'  # actually UCS-4
            else:
                assert False, f"Illegal Unicode kind {c_kind}"
        else:
            # old Py_UNICODE string
            py_buffer_len = python.PyUnicode_GET_DATA_SIZE(utext)
            c_text = python.PyUnicode_AS_DATA(utext)
        # libxml2 takes the buffer length as a C int
        assert 0 <= py_buffer_len <= limits.INT_MAX
        buffer_len = py_buffer_len

        context = self._getParserContext()
        context.prepare()
        try:
            pctxt = context._c_ctxt
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
            orig_options = pctxt.options
            with nogil:
                if self._for_html:
                    result = htmlparser.htmlCtxtReadMemory(
                        pctxt, c_text, buffer_len, c_filename, c_encoding,
                        self._parse_options)
                    if result is not NULL:
                        if _fixHtmlDictNames(pctxt.dict, result) < 0:
                            tree.xmlFreeDoc(result)
                            result = NULL
                else:
                    result = xmlparser.xmlCtxtReadMemory(
                        pctxt, c_text, buffer_len, c_filename, c_encoding,
                        self._parse_options)
            pctxt.options = orig_options  # work around libxml2 problem

            return context._handleParseResultDoc(self, result, None)
        finally:
            context.cleanup()

    cdef xmlDoc* _parseDoc(self, const char* c_text, int c_len, char* c_filename) except NULL:
        """Parse document, share dictionary if possible.

        Without a default encoding, UTF-32 input is detected here (by BOM
        or by libxml2's encoding detection) and announced to the parser
        explicitly.
        """
        cdef _ParserContext context
        cdef xmlDoc* result
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef char* c_encoding
        cdef tree.xmlCharEncoding enc
        context = self._getParserContext()
        context.prepare()
        try:
            pctxt = context._c_ctxt
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)

            if self._default_encoding is None:
                c_encoding = NULL
                # libxml2 (at least 2.9.3) does not recognise UTF-32 BOMs
                # NOTE: limit to problematic cases because it changes character offsets
                if c_len >= 4 and (c_text[0] == b'\xFF' and c_text[1] == b'\xFE' and
                                   c_text[2] == 0 and c_text[3] == 0):
                    c_encoding = "UTF-32LE"
                    c_text += 4
                    c_len -= 4
                elif c_len >= 4 and (c_text[0] == 0 and c_text[1] == 0 and
                                     c_text[2] == b'\xFE' and c_text[3] == b'\xFF'):
                    c_encoding = "UTF-32BE"
                    c_text += 4
                    c_len -= 4
                else:
                    # no BOM => try to determine encoding
                    enc = tree.xmlDetectCharEncoding(<const_xmlChar*>c_text, c_len)
                    if enc == tree.XML_CHAR_ENCODING_UCS4LE:
                        c_encoding = 'UTF-32LE'
                    elif enc == tree.XML_CHAR_ENCODING_UCS4BE:
                        c_encoding = 'UTF-32BE'
            else:
                c_encoding = _cstr(self._default_encoding)

            orig_options = pctxt.options
            with nogil:
                if self._for_html:
                    result = htmlparser.htmlCtxtReadMemory(
                        pctxt, c_text, c_len, c_filename,
                        c_encoding, self._parse_options)
                    if result is not NULL:
                        if _fixHtmlDictNames(pctxt.dict, result) < 0:
                            tree.xmlFreeDoc(result)
                            result = NULL
                else:
                    result = xmlparser.xmlCtxtReadMemory(
                        pctxt, c_text, c_len, c_filename,
                        c_encoding, self._parse_options)
            pctxt.options = orig_options  # work around libxml2 problem

            return context._handleParseResultDoc(self, result, None)
        finally:
            context.cleanup()

    cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL:
        """Let libxml2 read and parse the file named by c_filename."""
        cdef _ParserContext context
        cdef xmlDoc* result
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef char* c_encoding
        result = NULL

        context = self._getParserContext()
        context.prepare()
        try:
            pctxt = context._c_ctxt
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)

            if self._default_encoding is None:
                c_encoding = NULL
            else:
                c_encoding = _cstr(self._default_encoding)

            orig_options = pctxt.options
            with nogil:
                if self._for_html:
                    result = htmlparser.htmlCtxtReadFile(
                        pctxt, c_filename, c_encoding, self._parse_options)
                    if result is not NULL:
                        if _fixHtmlDictNames(pctxt.dict, result) < 0:
                            tree.xmlFreeDoc(result)
                            result = NULL
                else:
                    result = xmlparser.xmlCtxtReadFile(
                        pctxt, c_filename, c_encoding, self._parse_options)
            pctxt.options = orig_options  # work around libxml2 problem

            return context._handleParseResultDoc(self, result, c_filename)
        finally:
            context.cleanup()

    cdef xmlDoc* _parseDocFromFilelike(self, filelike, filename,
                                       encoding) except NULL:
        """Parse a document by reading it from a file-like object.

        An explicit ``encoding`` takes precedence over the parser's default
        encoding.  A false ``filename`` is treated as None.
        """
        cdef _ParserContext context
        cdef _FileReaderContext file_context
        cdef xmlDoc* result
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef char* c_filename
        if not filename:
            filename = None

        context = self._getParserContext()
        context.prepare()
        try:
            pctxt = context._c_ctxt
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
            file_context = _FileReaderContext(
                filelike, context, filename,
                encoding or self._default_encoding)
            result = file_context._readDoc(pctxt, self._parse_options)

            return context._handleParseResultDoc(
                self, result, filename)
        finally:
            context.cleanup()
cdef tree.xmlEntity* _getInternalEntityOnly(void* ctxt, const_xmlChar* name) noexcept nogil:
    """
    Callback function to intercept the entity resolution when external entity loading is disabled.

    Returns the entity found by xmlSAX2GetEntity() unless it is one of the
    external entity types.  For those, a fatal structured error is reported
    (if an error function is registered), the parser context is marked as
    not well-formed and NULL is returned.
    """
    cdef tree.xmlEntity* entity = xmlparser.xmlSAX2GetEntity(ctxt, name)
    if not entity:
        return NULL
    if entity.etype not in (
            tree.xmlEntityType.XML_EXTERNAL_GENERAL_PARSED_ENTITY,
            tree.xmlEntityType.XML_EXTERNAL_GENERAL_UNPARSED_ENTITY,
            tree.xmlEntityType.XML_EXTERNAL_PARAMETER_ENTITY):
        return entity

    # Reject all external entities and fail the parsing instead. There is currently
    # no way in libxml2 to just prevent the entity resolution in this case.

    cdef xmlerror.xmlError c_error
    cdef xmlerror.xmlStructuredErrorFunc err_func
    cdef xmlparser.xmlParserInput* parser_input
    cdef void* err_context

    c_ctxt = <xmlparser.xmlParserCtxt *> ctxt
    err_func = xmlerror.xmlStructuredError
    if err_func:
        parser_input = c_ctxt.input
        # Copied from xmlVErrParser() in libxml2: get current input from stack.
        if parser_input and parser_input.filename is NULL and c_ctxt.inputNr > 1:
            parser_input = c_ctxt.inputTab[c_ctxt.inputNr - 2]

        c_error = xmlerror.xmlError(
            domain=xmlerror.xmlErrorDomain.XML_FROM_PARSER,
            code=xmlerror.xmlParserErrors.XML_ERR_EXT_ENTITY_STANDALONE,
            level=xmlerror.xmlErrorLevel.XML_ERR_FATAL,
            message=b"External entity resolution is disabled for security reasons "
                    b"when resolving '&%s;'. Use 'XMLParser(resolve_entities=True)' "
                    b"if you consider it safe to enable it.",
            # parser_input may be NULL: guard the file name in the same
            # way as the line and column below.
            file=parser_input.filename if parser_input else NULL,
            node=entity,
            str1=<char*> name,
            str2=NULL,
            str3=NULL,
            line=parser_input.line if parser_input else 0,
            int1=0,
            int2=parser_input.col if parser_input else 0,
        )
        err_context = xmlerror.xmlStructuredErrorContext
        err_func(err_context, &c_error)

    c_ctxt.wellFormed = 0
    # The entity was looked up and does not need to be freed.
    return NULL
cdef void _initSaxDocument(void* ctxt) noexcept with gil:
    """
    SAX startDocument callback: run the libxml2 default handler, then hook
    the parser's dict into the new document and set up (or switch off) the
    document's XML ID hash table.
    """
    xmlparser.xmlSAX2StartDocument(ctxt)
    c_parser_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    c_new_doc = c_parser_ctxt.myDoc

    # set up document dict
    if c_new_doc is not NULL and c_parser_ctxt.dict is not NULL \
            and c_new_doc.dict is NULL:
        # I have no idea why libxml2 disables this - we need it
        c_parser_ctxt.dictNames = 1
        c_new_doc.dict = c_parser_ctxt.dict
        xmlparser.xmlDictReference(c_parser_ctxt.dict)

    # set up XML ID hash table
    if c_parser_ctxt._private is NULL:
        # no lxml-level context attached => nothing more to configure
        return

    parser_context = <_ParserContext>c_parser_ctxt._private
    if not parser_context._collect_ids:
        c_parser_ctxt.loadsubset |= xmlparser.XML_SKIP_IDS
        if c_new_doc is not NULL and c_new_doc.ids is not NULL \
                and tree.xmlHashSize(c_new_doc.ids) == 0:
            # already initialised but empty => clear
            tree.xmlHashFree(c_new_doc.ids, NULL)
            c_new_doc.ids = NULL
    elif c_new_doc is not NULL and c_new_doc.ids is NULL:
        # keep the global parser dict from filling up with XML IDs
        # memory errors are not fatal here
        c_id_dict = xmlparser.xmlDictCreate()
        if c_id_dict is NULL:
            c_new_doc.ids = tree.xmlHashCreate(0)
        else:
            c_new_doc.ids = tree.xmlHashCreateDict(0, c_id_dict)
            xmlparser.xmlDictFree(c_id_dict)
- ############################################################
- ## ET feed parser
- ############################################################
cdef class _FeedParser(_BaseParser):
    """Parser base class that adds the ``feed()`` / ``close()`` push
    parser interface.
    """
    # true between the first feed() of a run and its end (close() or error)
    cdef bint _feed_parser_running

    @property
    def feed_error_log(self):
        """The error log of the last (or current) run of the feed parser.

        Note that this is local to the feed parser and thus is
        different from what the ``error_log`` property returns.
        """
        return self._getPushParserContext()._error_log.copy()

    cpdef feed(self, data):
        """feed(self, data)

        Feeds data to the parser.  The argument should be an 8-bit string
        buffer containing encoded data, although Unicode is supported as long
        as both string types are not mixed.

        This is the main entry point to the consumer interface of a
        parser.  The parser will parse as much of the XML stream as it
        can on each call.  To finish parsing or to reset the parser,
        call the ``close()`` method.  Both methods may raise
        ParseError if errors occur in the input data.  If an error is
        raised, there is no longer a need to call ``close()``.

        The feed parser interface is independent of the normal parser
        usage.  You can use the same parser as a feed parser and in
        the ``parse()`` function concurrently.
        """
        cdef _ParserContext context
        cdef bytes bstring
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef Py_ssize_t py_buffer_len, ustart
        cdef const_char* char_data
        cdef const_char* c_encoding
        cdef int buffer_len
        cdef int error
        cdef bint recover = self._parse_options & xmlparser.XML_PARSE_RECOVER

        if isinstance(data, bytes):
            if self._default_encoding is None:
                c_encoding = NULL
            else:
                c_encoding = self._default_encoding
            char_data = _cstr(data)
            py_buffer_len = python.PyBytes_GET_SIZE(data)
            ustart = 0
        elif isinstance(data, unicode):
            # Unicode input is re-encoded to UTF-8 chunk by chunk below;
            # a NULL char_data marks this mode.
            c_encoding = b"UTF-8"
            char_data = NULL
            py_buffer_len = len(<unicode> data)
            ustart = 0
        else:
            raise TypeError, "Parsing requires string data"

        context = self._getPushParserContext()
        pctxt = context._c_ctxt
        error = 0
        if not self._feed_parser_running:
            context.prepare(set_document_loader=False)
            self._feed_parser_running = 1
            c_filename = (_cstr(self._filename)
                          if self._filename is not None else NULL)

            # We have to give *mlCtxtResetPush() enough input to figure
            # out the character encoding (at least four bytes),
            # however if we give it all we got, we'll have nothing for
            # *mlParseChunk() and things go wrong.
            buffer_len = 0
            if char_data is not NULL:
                buffer_len = 4 if py_buffer_len > 4 else <int>py_buffer_len
            orig_loader = _register_document_loader()
            if self._for_html:
                error = _htmlCtxtResetPush(
                    pctxt, char_data, buffer_len, c_filename, c_encoding,
                    self._parse_options)
            else:
                xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options)
                error = xmlparser.xmlCtxtResetPush(
                    pctxt, char_data, buffer_len, c_filename, c_encoding)
            _reset_document_loader(orig_loader)
            py_buffer_len -= buffer_len
            char_data += buffer_len
            if error:
                # The run never started: undo prepare() and the running
                # flag, as all other exit paths do, before reporting.
                self._feed_parser_running = 0
                context.cleanup()
                raise MemoryError()
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)

        #print pctxt.charset, 'NONE' if c_encoding is NULL else c_encoding

        fixup_error = 0
        while py_buffer_len > 0 and (error == 0 or recover):
            if char_data is NULL:
                # Unicode parsing by converting chunks to UTF-8
                buffer_len = 2**19  # len(bytes) <= 4 * (2**19) == 2 MiB
                bstring = (<unicode> data)[ustart : ustart+buffer_len].encode('UTF-8')
                ustart += buffer_len
                py_buffer_len -= buffer_len  # may end up < 0
                error, fixup_error = _parse_data_chunk(pctxt, <const char*> bstring, <int> len(bstring))
            else:
                # Direct byte string parsing.
                buffer_len = <int>py_buffer_len if py_buffer_len <= limits.INT_MAX else limits.INT_MAX
                error, fixup_error = _parse_data_chunk(pctxt, char_data, buffer_len)
                py_buffer_len -= buffer_len
                char_data += buffer_len

            if fixup_error:
                context.store_exception(MemoryError())

            if context._has_raised():
                # propagate Python exceptions immediately
                recover = 0
                error = 1
                break

            if error and not pctxt.replaceEntities and not pctxt.validate:
                # in this mode, we ignore errors about undefined entities
                for entry in context._error_log.filter_from_errors():
                    if entry.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
                           entry.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
                        break
                else:
                    error = 0

        if not pctxt.wellFormed and xmlparser.xmlCtxtIsStopped(pctxt) and context._has_raised():
            # propagate Python exceptions immediately
            recover = 0
            error = 1

        if fixup_error or not recover and (error or not pctxt.wellFormed):
            # The run ends here; the result handling is expected to raise.
            self._feed_parser_running = 0
            try:
                context._handleParseResult(self, pctxt.myDoc, None)
            finally:
                context.cleanup()

    cpdef close(self):
        """close(self)

        Terminates feeding data to this parser.  This tells the parser to
        process any remaining data in the feed buffer, and then returns the
        root Element of the tree that was parsed.

        This method must be called after passing the last chunk of data into
        the ``feed()`` method.  It should only be called when using the feed
        parser interface, all other usage is undefined.
        """
        if not self._feed_parser_running:
            raise XMLSyntaxError("no element found",
                                 xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0,
                                 self._filename)

        context = self._getPushParserContext()
        pctxt = context._c_ctxt

        self._feed_parser_running = 0
        # an empty chunk with terminate=1 finishes the document
        if self._for_html:
            htmlparser.htmlParseChunk(pctxt, NULL, 0, 1)
        else:
            xmlparser.xmlParseChunk(pctxt, NULL, 0, 1)

        if (pctxt.recovery and not xmlparser.xmlCtxtIsStopped(pctxt) and
                isinstance(context, _SaxParserContext)):
            # apply any left-over 'end' events
            (<_SaxParserContext>context).flushEvents()

        try:
            result = context._handleParseResult(self, pctxt.myDoc, None)
        finally:
            context.cleanup()

        if isinstance(result, _Document):
            return (<_Document>result).getroot()
        else:
            return result
cdef (int, int) _parse_data_chunk(xmlparser.xmlParserCtxt* c_ctxt,
                                  const char* char_data, int buffer_len):
    """
    Feed one chunk of data into the push parser, without the GIL.

    Returns the tuple (parse result code, HTML name fixup result).  The
    fixup result is always 0 for XML contexts.
    """
    cdef int parse_error
    cdef int fixup_error = 0
    with nogil:
        if not c_ctxt.html:
            orig_loader = _register_document_loader()
            parse_error = xmlparser.xmlParseChunk(c_ctxt, char_data, buffer_len, 0)
            _reset_document_loader(orig_loader)
        else:
            c_last_node = c_ctxt.node  # last node where the parser stopped
            orig_loader = _register_document_loader()
            parse_error = htmlparser.htmlParseChunk(c_ctxt, char_data, buffer_len, 0)
            _reset_document_loader(orig_loader)
            # and now for the fun part: move node names to the dict
            if c_ctxt.myDoc is not NULL:
                fixup_error = _fixHtmlDictSubtreeNames(
                    c_ctxt.dict, c_ctxt.myDoc, c_last_node)
                if c_ctxt.myDoc.dict is not NULL and c_ctxt.myDoc.dict is not c_ctxt.dict:
                    # replace the document's own dict by the parser's dict
                    xmlparser.xmlDictFree(c_ctxt.myDoc.dict)
                    c_ctxt.myDoc.dict = c_ctxt.dict
                    xmlparser.xmlDictReference(c_ctxt.dict)
    return (parse_error, fixup_error)
cdef int _htmlCtxtResetPush(xmlparser.xmlParserCtxt* c_ctxt,
                            const_char* c_data, int buffer_len,
                            const_char* c_filename, const_char* c_encoding,
                            int parse_options) except -1:
    """
    Reset a push parser context for parsing HTML.

    Returns the non-zero result of xmlCtxtResetPush() if the reset fails,
    and 0 after the context was switched to HTML mode with the given
    parse options applied.
    """
    # libxml2 lacks an HTML push parser setup function
    reset_result = xmlparser.xmlCtxtResetPush(
        c_ctxt, c_data, buffer_len, c_filename, c_encoding)
    if reset_result:
        return reset_result

    # fix libxml2 setup for HTML
    if tree.LIBXML_VERSION < 21400:
        c_ctxt.progressive = 1  # TODO: remove
    c_ctxt.html = 1
    htmlparser.htmlCtxtUseOptions(c_ctxt, parse_options)

    return 0
- ############################################################
- ## XML parser
- ############################################################
# libxml2 option flags that every XMLParser starts from; XMLParser.__init__
# adds to and removes from this set according to its keyword arguments.
cdef int _XML_DEFAULT_PARSE_OPTIONS = (
    xmlparser.XML_PARSE_NOENT
    | xmlparser.XML_PARSE_NOCDATA
    | xmlparser.XML_PARSE_NONET
    | xmlparser.XML_PARSE_COMPACT
    | xmlparser.XML_PARSE_BIG_LINES
)
- cdef class XMLParser(_FeedParser):
- """XMLParser(self, encoding=None, attribute_defaults=False, dtd_validation=False, \
- load_dtd=False, no_network=True, decompress=False, ns_clean=False, \
- recover=False, schema: XMLSchema =None, huge_tree=False, \
- remove_blank_text=False, resolve_entities=True, \
- remove_comments=False, remove_pis=False, strip_cdata=True, \
- collect_ids=True, target=None, compact=True)
- The XML parser.
- Parsers can be supplied as additional argument to various parse
- functions of the lxml API. A default parser is always available
- and can be replaced by a call to the global function
- 'set_default_parser'. New parsers can be created at any time
- without a major run-time overhead.
- The keyword arguments in the constructor are mainly based on the
- libxml2 parser configuration. A DTD will also be loaded if DTD
- validation or attribute default values are requested (unless you
- additionally provide an XMLSchema from which the default
- attributes can be read).
- Available boolean keyword arguments:
- - attribute_defaults - inject default attributes from DTD or XMLSchema
- - dtd_validation - validate against a DTD referenced by the document
- - load_dtd - use DTD for parsing
- - no_network - prevent network access for related files (default: True)
- - decompress - automatically decompress gzip input
- (default: False, changed in lxml 6.0, disabling only affects libxml2 2.15+)
- - ns_clean - clean up redundant namespace declarations
- - recover - try hard to parse through broken XML
- - remove_blank_text - discard blank text nodes that appear ignorable
- - remove_comments - discard comments
- - remove_pis - discard processing instructions
- - strip_cdata - replace CDATA sections by normal text content (default: True)
- - compact - save memory for short text content (default: True)
- - collect_ids - use a hash table of XML IDs for fast access
- (default: True, always True with DTD validation)
- - huge_tree - disable security restrictions and support very deep trees
- and very long text content
- Other keyword arguments:
- - resolve_entities - replace entities by their text value: False for keeping the
- entity references, True for resolving them, and 'internal' for resolving
- internal definitions only (no external file/URL access).
- The default used to be True and was changed to 'internal' in lxml 5.0.
- - encoding - override the document encoding (note: libiconv encoding name)
- - target - a parser target object that will receive the parse events
- - schema - an XMLSchema to validate against
- Note that you should avoid sharing parsers between threads. While this is
- not harmful, it is more efficient to use separate parsers. This does not
- apply to the default parser.
- """
def __init__(self, *, encoding=None, attribute_defaults=False,
             dtd_validation=False, load_dtd=False, no_network=True, decompress=False,
             ns_clean=False, recover=False, XMLSchema schema=None,
             huge_tree=False, remove_blank_text=False, resolve_entities='internal',
             remove_comments=False, remove_pis=False, strip_cdata=True,
             collect_ids=True, target=None, compact=True):
    # Translate the keyword arguments into a libxml2 option bit mask,
    # starting from the module-wide XML default options.
    cdef int options = _XML_DEFAULT_PARSE_OPTIONS
    # Stays True unless resolve_entities is exactly 'internal'.
    cdef bint external_entities = True

    if load_dtd:
        options |= xmlparser.XML_PARSE_DTDLOAD
    if dtd_validation:
        # validating against a DTD also requires loading it
        options |= xmlparser.XML_PARSE_DTDVALID | xmlparser.XML_PARSE_DTDLOAD
    if attribute_defaults:
        options |= xmlparser.XML_PARSE_DTDATTR
        if schema is None:
            # without a schema, request DTD loading as well
            options |= xmlparser.XML_PARSE_DTDLOAD
    if ns_clean:
        options |= xmlparser.XML_PARSE_NSCLEAN
    if recover:
        options |= xmlparser.XML_PARSE_RECOVER
    if remove_blank_text:
        options |= xmlparser.XML_PARSE_NOBLANKS
    if huge_tree:
        options |= xmlparser.XML_PARSE_HUGE

    # The following flags are toggled with XOR relative to the defaults
    # (presumably they are part of _XML_DEFAULT_PARSE_OPTIONS - verify there).
    if not no_network:
        options ^= xmlparser.XML_PARSE_NONET
    if not compact:
        options ^= xmlparser.XML_PARSE_COMPACT
    if not resolve_entities:
        options ^= xmlparser.XML_PARSE_NOENT
    elif resolve_entities == 'internal':
        external_entities = False
    if not strip_cdata:
        options ^= xmlparser.XML_PARSE_NOCDATA

    if decompress:
        options |= xmlparser.XML_PARSE_UNZIP

    # Second positional argument is False here; HTMLParser passes True.
    _BaseParser.__init__(
        self, options, False, schema,
        remove_comments, remove_pis, strip_cdata,
        collect_ids, target, encoding, external_entities)
# Allow subscripting XMLParser in type annotations (PEP 560)
def __class_getitem__(cls, item):
    # Wrap the class and its subscript argument in a generic alias object.
    generic_alias = _GenericAlias(cls, item)
    return generic_alias
cdef class XMLPullParser(XMLParser):
    """XMLPullParser(self, events=None, *, tag=None, base_url=None, **kwargs)

    XML parser that collects parse events in an iterator.

    The collected events are the same as for iterparse(), but the
    parser itself is non-blocking in the sense that it receives
    data chunks incrementally through its .feed() method, instead
    of reading them directly from a file(-like) object all by itself.

    By default, it collects Element end events.  To change that,
    pass any subset of the available events into the ``events``
    argument: ``'start'``, ``'end'``, ``'start-ns'``,
    ``'end-ns'``, ``'comment'``, ``'pi'``.

    To support loading external dependencies relative to the input
    source, you can pass the ``base_url``.

    All other keyword arguments are passed on to ``XMLParser``.
    """
    def __init__(self, events=None, *, tag=None, base_url=None, **kwargs):
        XMLParser.__init__(self, **kwargs)
        if events is None:
            # default: report only element end events
            events = ('end',)
        self._setBaseURL(base_url)
        self._collectEvents(events, tag)

    def read_events(self):
        """read_events(self)

        Return the events iterator of the push parser context.
        """
        return (<_SaxParserContext?>self._getPushParserContext()).events_iterator
cdef class ETCompatXMLParser(XMLParser):
    """ETCompatXMLParser(self, encoding=None, attribute_defaults=False, \
                 dtd_validation=False, load_dtd=False, no_network=True, decompress=False, \
                 ns_clean=False, recover=False, schema=None, \
                 huge_tree=False, remove_blank_text=False, resolve_entities=True, \
                 remove_comments=True, remove_pis=True, strip_cdata=True, \
                 collect_ids=True, target=None, compact=True)

    An XML parser with an ElementTree compatible default setup.

    See the XMLParser class for details.

    This parser has ``remove_comments`` and ``remove_pis`` enabled by default
    and thus ignores comments and processing instructions.
    """
    def __init__(self, *, encoding=None, attribute_defaults=False,
                 dtd_validation=False, load_dtd=False, no_network=True, decompress=False,
                 ns_clean=False, recover=False, schema=None,
                 huge_tree=False, remove_blank_text=False, resolve_entities=True,
                 remove_comments=True, remove_pis=True, strip_cdata=True,
                 collect_ids=True, target=None, compact=True):
        # Pure pass-through to XMLParser; only the defaults of
        # resolve_entities, remove_comments and remove_pis differ.
        # 'collect_ids' is forwarded so that every XMLParser option is
        # reachable from this subclass as well (same default as XMLParser).
        XMLParser.__init__(self,
                           attribute_defaults=attribute_defaults,
                           dtd_validation=dtd_validation,
                           load_dtd=load_dtd,
                           no_network=no_network,
                           decompress=decompress,
                           ns_clean=ns_clean,
                           recover=recover,
                           remove_blank_text=remove_blank_text,
                           huge_tree=huge_tree,
                           compact=compact,
                           resolve_entities=resolve_entities,
                           remove_comments=remove_comments,
                           remove_pis=remove_pis,
                           strip_cdata=strip_cdata,
                           collect_ids=collect_ids,
                           target=target,
                           encoding=encoding,
                           schema=schema,
                           )
# ET 1.2 compatible name
XMLTreeBuilder = ETCompatXMLParser

# Module-level XMLParser instance built with default arguments; it is
# registered as the default parser of the global parser context.
cdef XMLParser __DEFAULT_XML_PARSER = XMLParser()
__GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER)
def set_default_parser(_BaseParser parser=None):
    """set_default_parser(parser=None)

    Set a default parser for the current thread.  This parser is used
    globally whenever no parser is supplied to the various parse functions of
    the lxml API.  If this function is called without a parser (or if it is
    None), the default parser is reset to the original configuration.

    Note that the pre-installed default parser is not thread-safe.  Avoid the
    default parser in multi-threaded environments.  You can create a separate
    parser for each thread explicitly or use a parser pool.
    """
    cdef _BaseParser new_default = parser
    if new_default is None:
        # fall back to the module-level pre-installed XML parser
        new_default = __DEFAULT_XML_PARSER
    __GLOBAL_PARSER_CONTEXT.setDefaultParser(new_default)
def get_default_parser():
    """get_default_parser()

    Return the default parser as reported by the global parser context.
    """
    default_parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return default_parser
- ############################################################
- ## HTML parser
- ############################################################
# Option bit mask that HTMLParser starts from before applying its
# keyword arguments: recover + no network + compact.
cdef int _HTML_DEFAULT_PARSE_OPTIONS = (
    htmlparser.HTML_PARSE_RECOVER
    | htmlparser.HTML_PARSE_NONET
    | htmlparser.HTML_PARSE_COMPACT
)

# Sentinel used as argument default to detect "argument was not passed".
cdef object _UNUSED = object()
cdef class HTMLParser(_FeedParser):
    """HTMLParser(self, encoding=None, remove_blank_text=False, \
                   remove_comments=False, remove_pis=False, \
                   no_network=True, decompress=False, target=None, schema: XMLSchema =None, \
                   recover=True, compact=True, default_doctype=True, \
                   collect_ids=True, huge_tree=False)

    The HTML parser.

    This parser allows reading HTML into a normal XML tree.  By
    default, it can read broken (non well-formed) HTML, depending on
    the capabilities of libxml2.  Use the 'recover' option to switch
    this off.

    Available boolean keyword arguments:

    - recover            - try hard to parse through broken HTML (default: True)
    - no_network         - prevent network access for related files (default: True)
    - decompress         - automatically decompress gzip input
                           (default: False, changed in lxml 6.0, disabling only affects libxml2 2.15+)
    - remove_blank_text  - discard empty text nodes that are ignorable (i.e. not actual text content)
    - remove_comments    - discard comments
    - remove_pis         - discard processing instructions
    - compact            - save memory for short text content (default: True)
    - default_doctype    - add a default doctype even if it is not found in the HTML (default: True)
    - collect_ids        - use a hash table of XML IDs for fast access (default: True)
    - huge_tree          - disable security restrictions and support very deep trees
                           and very long text content

    Other keyword arguments:

    - encoding - override the document encoding (note: libiconv encoding name)
    - target   - a parser target object that will receive the parse events
    - schema   - an XMLSchema to validate against

    The ``strip_cdata`` argument is deprecated; passing it only triggers a
    DeprecationWarning.

    Note that you should avoid sharing parsers between threads for performance
    reasons.
    """
    def __init__(self, *, encoding=None, remove_blank_text=False,
                 remove_comments=False, remove_pis=False, strip_cdata=_UNUSED,
                 no_network=True, decompress=False, target=None, XMLSchema schema=None,
                 recover=True, compact=True, default_doctype=True,
                 collect_ids=True, huge_tree=False):
        cdef int parse_options = _HTML_DEFAULT_PARSE_OPTIONS

        if remove_blank_text:
            parse_options |= htmlparser.HTML_PARSE_NOBLANKS

        # RECOVER, NONET and COMPACT are part of the HTML defaults, so they
        # are explicitly cleared when switched off.  Clearing (rather than
        # XOR-toggling) keeps this correct even if the defaults change.
        if not recover:
            parse_options &= ~htmlparser.HTML_PARSE_RECOVER
        if not no_network:
            parse_options &= ~htmlparser.HTML_PARSE_NONET
        if not compact:
            parse_options &= ~htmlparser.HTML_PARSE_COMPACT

        # NODEFDTD is not part of the defaults: set it to suppress the
        # default doctype.
        if not default_doctype:
            parse_options |= htmlparser.HTML_PARSE_NODEFDTD
        if huge_tree:
            parse_options |= xmlparser.XML_PARSE_HUGE
        if decompress:
            parse_options |= xmlparser.XML_PARSE_UNZIP

        if strip_cdata is not _UNUSED:
            # the argument was passed explicitly => tell the user it is obsolete
            import warnings
            warnings.warn(
                "The 'strip_cdata' option of HTMLParser() has never done anything and will eventually be removed.",
                DeprecationWarning)

        # Second positional argument is True here; XMLParser passes False.
        _BaseParser.__init__(self, parse_options, True, schema,
                             remove_comments, remove_pis, strip_cdata,
                             collect_ids, target, encoding)

    # Allow subscripting HTMLParser in type annotations (PEP 560)
    def __class_getitem__(cls, item):
        return _GenericAlias(cls, item)
# Module-level HTMLParser instance built with default arguments.
cdef HTMLParser __DEFAULT_HTML_PARSER = HTMLParser()
cdef class HTMLPullParser(HTMLParser):
    """HTMLPullParser(self, events=None, *, tag=None, base_url=None, **kwargs)

    HTML parser that collects parse events in an iterator.

    The collected events are the same as for iterparse(), but the
    parser itself is non-blocking in the sense that it receives
    data chunks incrementally through its .feed() method, instead
    of reading them directly from a file(-like) object all by itself.

    By default, it collects Element end events.  To change that,
    pass any subset of the available events into the ``events``
    argument: ``'start'``, ``'end'``, ``'start-ns'``,
    ``'end-ns'``, ``'comment'``, ``'pi'``.

    To support loading external dependencies relative to the input
    source, you can pass the ``base_url``.
    """
    def __init__(self, events=None, *, tag=None, base_url=None, **kwargs):
        HTMLParser.__init__(self, **kwargs)
        self._setBaseURL(base_url)
        # Without an explicit selection, only element end events are collected.
        self._collectEvents(('end',) if events is None else events, tag)

    def read_events(self):
        # Type-checked cast of the push parser context, then hand out its
        # events iterator.
        cdef _SaxParserContext sax_context = <_SaxParserContext?>self._getPushParserContext()
        return sax_context.events_iterator
- ############################################################
- ## helper functions for document creation
- ############################################################
cdef xmlDoc* _parseDoc(text, filename, _BaseParser parser) except NULL:
    """Parse in-memory data, dispatching on the Python type of 'text'.

    Uses the default parser of the global parser context if 'parser' is None.
    A falsy 'filename' is passed on to the parser as a NULL C filename.
    """
    cdef char* c_filename = NULL
    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    if filename:
        # 'filename_utf' stays referenced for the rest of the function
        # (presumably c_filename points into its buffer - see _cstr()).
        filename_utf = _encodeFilenameUTF8(filename)
        c_filename = _cstr(filename_utf)

    if isinstance(text, bytes):
        return _parseDoc_bytes(<bytes> text, filename, c_filename, parser)
    if isinstance(text, unicode):
        return _parseDoc_unicode(<unicode> text, filename, c_filename, parser)
    # anything else is handed to the buffer based parse function
    return _parseDoc_charbuffer(text, filename, c_filename, parser)
cdef xmlDoc* _parseDoc_unicode(unicode text, filename, char* c_filename, _BaseParser parser) except NULL:
    """Parse a Unicode string.

    Strings whose computed data size exceeds INT_MAX are wrapped in a
    StringIO and parsed through the file-like code path instead.
    """
    cdef Py_ssize_t data_size
    if not python.PyUnicode_IS_READY(text):
        # old Py_UNICODE string
        data_size = python.PyUnicode_GET_DATA_SIZE(text)
    else:
        # PEP-393 Unicode string: number of characters times character kind
        data_size = python.PyUnicode_GET_LENGTH(text) * python.PyUnicode_KIND(text)

    if data_size <= limits.INT_MAX:
        return parser._parseUnicodeDoc(text, c_filename)
    return parser._parseDocFromFilelike(StringIO(text), filename, None)
cdef xmlDoc* _parseDoc_bytes(bytes text, filename, char* c_filename, _BaseParser parser) except NULL:
    """Parse a bytes object.

    Inputs longer than INT_MAX bytes are wrapped in a BytesIO and parsed
    through the file-like code path instead.
    """
    cdef Py_ssize_t byte_count = len(text)
    if byte_count <= limits.INT_MAX:
        return parser._parseDoc(text, byte_count, c_filename)
    return parser._parseDocFromFilelike(BytesIO(text), filename, None)
cdef xmlDoc* _parseDoc_charbuffer(text, filename, char* c_filename, _BaseParser parser) except NULL:
    """Parse from an object that a memoryview can be created from.

    Buffers longer than INT_MAX bytes are wrapped in a BytesIO and parsed
    through the file-like code path instead.
    """
    # view the data as a contiguous 'unsigned char' buffer
    cdef const unsigned char[::1] byte_view = memoryview(text).cast('B')
    cdef Py_ssize_t byte_count = len(byte_view)
    if byte_count <= limits.INT_MAX:
        return parser._parseDoc(<const char*>&byte_view[0], byte_count, c_filename)
    return parser._parseDocFromFilelike(BytesIO(text), filename, None)
cdef xmlDoc* _parseDocFromFile(filename8, _BaseParser parser) except NULL:
    """Parse the file named by 'filename8'.

    Uses the default parser of the global parser context if 'parser' is None.
    """
    cdef _BaseParser active_parser = parser
    if active_parser is None:
        active_parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return active_parser._parseDocFromFile(_cstr(filename8))
cdef xmlDoc* _parseDocFromFilelike(source, filename,
                                   _BaseParser parser) except NULL:
    """Parse from the file-like object 'source'.

    Uses the default parser of the global parser context if 'parser' is None.
    The third argument of the parser method is always passed as None here.
    """
    cdef _BaseParser active_parser = parser
    if active_parser is None:
        active_parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return active_parser._parseDocFromFilelike(source, filename, None)
cdef xmlDoc* _newXMLDoc() except NULL:
    """Create a new, empty XML document.

    Raises MemoryError if libxml2 returns NULL.  A document without an
    encoding gets "UTF-8" assigned.
    """
    cdef xmlDoc* c_doc = tree.xmlNewDoc(NULL)
    if c_doc is NULL:
        raise MemoryError()
    if c_doc.encoding is NULL:
        c_doc.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")
    # let the global parser context set up the dict of the new document
    __GLOBAL_PARSER_CONTEXT.initDocDict(c_doc)
    return c_doc
cdef xmlDoc* _newHTMLDoc() except NULL:
    """Create a new, empty HTML document.

    Raises MemoryError if libxml2 returns NULL.
    """
    cdef xmlDoc* c_doc = tree.htmlNewDoc(NULL, NULL)
    if c_doc is NULL:
        raise MemoryError()
    # let the global parser context set up the dict of the new document
    __GLOBAL_PARSER_CONTEXT.initDocDict(c_doc)
    return c_doc
cdef xmlDoc* _copyDoc(xmlDoc* c_doc, int recursive) except NULL:
    """Copy a document, with its content if 'recursive' is non-zero.

    Raises MemoryError if libxml2 returns NULL.
    """
    cdef xmlDoc* c_copy
    if not recursive:
        c_copy = tree.xmlCopyDoc(c_doc, 0)
    else:
        # only the recursive copy is run with the GIL released
        with nogil:
            c_copy = tree.xmlCopyDoc(c_doc, recursive)
    if c_copy is NULL:
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(c_copy)
    return c_copy
cdef xmlDoc* _copyDocRoot(xmlDoc* c_doc, xmlNode* c_new_root) except NULL:
    """Recursively copy the document and make c_new_root the new root node.

    The document itself is copied non-recursively; only the subtree of
    'c_new_root' (plus its tail, via _copyTail()) is copied into the result.

    Raises MemoryError if libxml2 fails to allocate the document copy or
    the node copy.
    """
    cdef xmlDoc* result
    cdef xmlNode* c_node
    result = tree.xmlCopyDoc(c_doc, 0)  # non recursive
    if result is NULL:
        # must not hand a NULL document to initDocDict()
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
    with nogil:
        c_node = tree.xmlDocCopyNode(c_new_root, result, 1)  # recursive
    if c_node is NULL:
        # don't leak the freshly created document copy
        tree.xmlFreeDoc(result)
        raise MemoryError()
    tree.xmlDocSetRootElement(result, c_node)
    _copyTail(c_new_root.next, c_node)
    return result
cdef xmlNode* _copyNodeToDoc(xmlNode* c_node, xmlDoc* c_doc) except NULL:
    """Recursively copy the element into the document.  c_doc is not modified.

    Raises MemoryError if libxml2 fails to copy the node.
    """
    cdef xmlNode* c_copy = tree.xmlDocCopyNode(c_node, c_doc, 1)  # recursive
    if c_copy is NULL:
        raise MemoryError()
    # copy over what follows the original node as its tail
    _copyTail(c_node.next, c_copy)
    return c_copy
- ############################################################
- ## API level helper functions for _Document creation
- ############################################################
cdef _Document _parseDocument(source, _BaseParser parser, base_url):
    """Parse 'source' into a _Document.

    'source' may be a string (parsed as file path/URL), an object with
    'getvalue' and 'tell' (parsed from memory when positioned at 0), or an
    object with a 'read' method.  Anything else raises TypeError.
    """
    cdef _Document doc
    source = _getFSPathOrObject(source)

    if _isString(source):
        # parse the file directly from the filesystem
        doc = _parseDocumentFromURL(_encodeFilename(source), parser)
        if base_url is None:
            return doc
        # fix base URL if requested: replace the URL stored in the C document
        base_url_utf = _encodeFilenameUTF8(base_url)
        if doc._c_doc.URL is not NULL:
            tree.xmlFree(<char*>doc._c_doc.URL)
        doc._c_doc.URL = tree.xmlStrdup(_xcstr(base_url_utf))
        return doc

    # an explicit base URL takes precedence over the name of the file object
    url = base_url if base_url is not None else _getFilenameForFile(source)

    # StringIO - reading from start?
    if (hasattr(source, 'getvalue') and hasattr(source, 'tell')
            and source.tell() == 0):
        return _parseMemoryDocument(source.getvalue(), url, parser)

    # Support for file-like objects (urlgrabber.urlopen, ...)
    if hasattr(source, 'read'):
        return _parseFilelikeDocument(source, url, parser)

    raise TypeError(f"cannot parse from '{python._fqtypename(source).decode('UTF-8')}'")
cdef _Document _parseDocumentFromURL(url, _BaseParser parser):
    """Parse the file at 'url' and wrap the resulting C document."""
    cdef xmlDoc* c_parsed = _parseDocFromFile(url, parser)
    return _documentFactory(c_parsed, parser)
cdef _Document _parseMemoryDocument(text, url, _BaseParser parser):
    """Parse in-memory data and wrap the resulting C document.

    Raises ValueError for Unicode strings that carry an encoding declaration.
    """
    if isinstance(text, unicode) and _hasEncodingDeclaration(text):
        raise ValueError(
            "Unicode strings with encoding declaration are not supported. "
            "Please use bytes input or XML fragments without declaration.")
    cdef xmlDoc* c_parsed = _parseDoc(text, url, parser)
    return _documentFactory(c_parsed, parser)
cdef _Document _parseFilelikeDocument(source, url, _BaseParser parser):
    """Parse from the file-like 'source' and wrap the resulting C document."""
    cdef xmlDoc* c_parsed = _parseDocFromFilelike(source, url, parser)
    return _documentFactory(c_parsed, parser)
|