| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761
77717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762
27722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762
77727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274327532763
27732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763
7773778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853 |
- # cython: binding=True
- # cython: auto_pickle=False
- # cython: language_level=3
- """
- The ``lxml.etree`` module implements the extended ElementTree API for XML.
- """
- __docformat__ = "restructuredtext en"
# Names exported by ``from lxml.etree import *``, kept in roughly
# alphabetical order (classes and constants first, functions last).
__all__ = [
    'AttributeBasedElementClassLookup', 'C14NError', 'C14NWriterTarget', 'CDATA',
    'Comment', 'CommentBase', 'CustomElementClassLookup', 'DEBUG',
    'DTD', 'DTDError', 'DTDParseError', 'DTDValidateError',
    'DocumentInvalid', 'ETCompatXMLParser', 'ETXPath', 'Element',
    'ElementBase', 'ElementClassLookup', 'ElementDefaultClassLookup',
    'ElementNamespaceClassLookup', 'ElementTree', 'Entity', 'EntityBase',
    'Error', 'ErrorDomains', 'ErrorLevels', 'ErrorTypes', 'Extension',
    'FallbackElementClassLookup', 'FunctionNamespace', 'HTML', 'HTMLParser',
    'ICONV_COMPILED_VERSION',
    'LIBXML_COMPILED_VERSION', 'LIBXML_VERSION',
    'LIBXML_FEATURES',
    'LIBXSLT_COMPILED_VERSION', 'LIBXSLT_VERSION',
    'LXML_VERSION',
    'LxmlError', 'LxmlRegistryError', 'LxmlSyntaxError',
    'NamespaceRegistryError', 'PI', 'PIBase', 'ParseError',
    'ParserBasedElementClassLookup', 'ParserError', 'ProcessingInstruction',
    'PyErrorLog', 'PythonElementClassLookup', 'QName', 'RelaxNG',
    'RelaxNGError', 'RelaxNGErrorTypes', 'RelaxNGParseError',
    'RelaxNGValidateError', 'Resolver', 'Schematron', 'SchematronError',
    'SchematronParseError', 'SchematronValidateError', 'SerialisationError',
    'SubElement', 'TreeBuilder', 'XInclude', 'XIncludeError', 'XML',
    'XMLDTDID', 'XMLID', 'XMLParser', 'XMLSchema', 'XMLSchemaError',
    'XMLSchemaParseError', 'XMLSchemaValidateError', 'XMLSyntaxError',
    'XMLTreeBuilder', 'XPath', 'XPathDocumentEvaluator', 'XPathError',
    'XPathEvalError', 'XPathEvaluator', 'XPathFunctionError', 'XPathResultError',
    'XPathSyntaxError', 'XSLT', 'XSLTAccessControl', 'XSLTApplyError',
    'XSLTError', 'XSLTExtension', 'XSLTExtensionError', 'XSLTParseError',
    'XSLTSaveError', 'canonicalize',
    'cleanup_namespaces', 'clear_error_log', 'dump',
    'fromstring', 'fromstringlist', 'get_default_parser', 'iselement',
    'iterparse', 'iterwalk', 'parse', 'parseid', 'register_namespace',
    'set_default_parser', 'set_element_class_lookup', 'strip_attributes',
    'strip_elements', 'strip_tags', 'tostring', 'tostringlist', 'tounicode',
    'use_global_python_log'
    ]
- cimport cython
- from lxml cimport python
- from lxml.includes cimport tree, config
- from lxml.includes.tree cimport xmlDoc, xmlNode, xmlAttr, xmlNs, _isElement, _getNs
- from lxml.includes.tree cimport const_xmlChar, xmlChar, _xcstr
- from lxml.python cimport _cstr, _isString
- from lxml.includes cimport xpath
- from lxml.includes cimport c14n
- # Cython's standard declarations
- cimport cpython.mem
- cimport cpython.ref
- from libc cimport limits, stdio, stdlib
- from libc cimport string as cstring_h # not to be confused with stdlib 'string'
- from libc.string cimport const_char
- cdef object os_path_abspath
- from os.path import abspath as os_path_abspath
- cdef object BytesIO, StringIO
- from io import BytesIO, StringIO
- cdef object OrderedDict
- from collections import OrderedDict
- cdef object _elementpath
- from lxml import _elementpath
- cdef object sys
- import sys
- cdef object re
- import re
- cdef object partial
- from functools import partial
- cdef object islice
- from itertools import islice
- cdef object ITER_EMPTY = iter(())
- cdef object MutableMapping
- from collections.abc import MutableMapping
class _ImmutableMapping(MutableMapping):
    """Permanently empty mapping that rejects every lookup and modification.

    Reading, writing or deleting any item raises ``KeyError`` carrying the
    requested key.  Membership tests are always false, the length is
    always 0 and iteration yields nothing.
    """
    # NOTE: the exceptions are built with an explicit ``KeyError(key)`` call.
    # The two-argument ``raise KeyError, key`` form unpacks tuple keys into
    # separate exception arguments and drops a ``None`` key entirely, so the
    # raised exception would not carry the looked-up key like a dict's does.

    def __getitem__(self, key):
        raise KeyError(key)

    def __setitem__(self, key, value):
        raise KeyError(key)

    def __delitem__(self, key):
        raise KeyError(key)

    def __contains__(self, key):
        return False

    def __len__(self):
        return 0

    def __iter__(self):
        # ITER_EMPTY iterates over an empty tuple, so sharing it is safe:
        # there is never anything to yield.
        return ITER_EMPTY

    # legacy (Python 2 style) iteration method names
    iterkeys = itervalues = iteritems = __iter__
- cdef object IMMUTABLE_EMPTY_MAPPING = _ImmutableMapping()
- del _ImmutableMapping
- # the rules
- # ---------
- # any libxml C argument/variable is prefixed with c_
- # any non-public function/class is prefixed with an underscore
- # instance creation is always through factories
- # what to do with libxml2/libxslt error messages?
- # 0 : drop
- # 1 : use log
- DEF __DEBUG = 1
- # maximum number of lines in the libxml2/xslt log if __DEBUG == 1
- DEF __MAX_LOG_SIZE = 100
- # make the compiled-in debug state publicly available
- DEBUG = __DEBUG
- # A struct to store a cached qualified tag name+href pair.
- # While we can borrow the c_name from the document dict,
- # PyPy requires us to store a Python reference for the
- # namespace in order to keep the byte buffer alive.
- cdef struct qname:
- const_xmlChar* c_name
- python.PyObject* href
- # initialize parser (and threading)
- xmlparser.xmlInitParser()
- # global per-thread setup
- tree.xmlThrDefIndentTreeOutput(1)
- tree.xmlThrDefLineNumbersDefaultValue(1)
- _initThreadLogging()
- # filename encoding
- cdef bytes _FILENAME_ENCODING = (sys.getfilesystemencoding() or sys.getdefaultencoding() or 'ascii').encode("UTF-8")
- cdef char* _C_FILENAME_ENCODING = _cstr(_FILENAME_ENCODING)
- # set up some default namespace prefixes
- cdef dict _DEFAULT_NAMESPACE_PREFIXES = {
- b"http://www.w3.org/XML/1998/namespace": b'xml',
- b"http://www.w3.org/1999/xhtml": b"html",
- b"http://www.w3.org/1999/XSL/Transform": b"xsl",
- b"http://www.w3.org/1999/02/22-rdf-syntax-ns#": b"rdf",
- b"http://schemas.xmlsoap.org/wsdl/": b"wsdl",
- # xml schema
- b"http://www.w3.org/2001/XMLSchema": b"xs",
- b"http://www.w3.org/2001/XMLSchema-instance": b"xsi",
- # dublin core
- b"http://purl.org/dc/elements/1.1/": b"dc",
- # objectify
- b"http://codespeak.net/lxml/objectify/pytype" : b"py",
- }
- # To avoid runtime encoding overhead, we keep a Unicode copy
- # of the uri-prefix mapping as (str, str) items view.
- cdef object _DEFAULT_NAMESPACE_PREFIXES_ITEMS = []
cdef _update_default_namespace_prefixes_items():
    """Rebuild the decoded (str, str) items view of the default prefix registry.

    Decodes every URI/prefix pair of ``_DEFAULT_NAMESPACE_PREFIXES`` from
    UTF-8 and rebinds ``_DEFAULT_NAMESPACE_PREFIXES_ITEMS`` to the items view
    of the resulting dict.
    """
    cdef bytes uri_utf, prefix_utf
    cdef dict decoded = {}
    global _DEFAULT_NAMESPACE_PREFIXES_ITEMS
    for uri_utf, prefix_utf in _DEFAULT_NAMESPACE_PREFIXES.items():
        decoded[uri_utf.decode('utf-8')] = prefix_utf.decode('utf-8')
    # rebind only after the complete mapping has been decoded
    _DEFAULT_NAMESPACE_PREFIXES_ITEMS = decoded.items()
- _update_default_namespace_prefixes_items()
- cdef object _check_internal_prefix = re.compile(br"ns\d+$").match
def register_namespace(prefix, uri):
    """Register a global default prefix for a namespace URI.

    Newly created Elements in that namespace will use the prefix.  The
    registry is global: any existing entry that uses either the given
    prefix or the given namespace URI is removed first.

    Raises ``ValueError`` for prefixes in the internally reserved "ns<N>"
    format and for attempts to rebind the 'xml' prefix or the XML namespace.
    """
    xml_ns_utf = b"http://www.w3.org/XML/1998/namespace"

    prefix_utf, uri_utf = _utf8(prefix), _utf8(uri)
    if _check_internal_prefix(prefix_utf):
        raise ValueError("Prefix format reserved for internal use")
    _tagValidOrRaise(prefix_utf)
    _uriValidOrRaise(uri_utf)

    # the 'xml' prefix and the XML namespace may only be mapped to each other
    is_xml_uri = uri_utf == xml_ns_utf
    is_xml_prefix = prefix_utf == b'xml'
    if is_xml_uri != is_xml_prefix:
        raise ValueError("Cannot change the 'xml' prefix of the XML namespace")

    # drop every registration that conflicts with the new pair
    stale_uris = [
        known_uri
        for known_uri, known_prefix in _DEFAULT_NAMESPACE_PREFIXES.items()
        if known_uri == uri_utf or known_prefix == prefix_utf
    ]
    for known_uri in stale_uris:
        del _DEFAULT_NAMESPACE_PREFIXES[known_uri]

    _DEFAULT_NAMESPACE_PREFIXES[uri_utf] = prefix_utf
    _update_default_namespace_prefixes_items()
- # Error superclass for ElementTree compatibility
cdef class Error(Exception):
    """Root exception class of this module (ElementTree compatible name)."""
    pass
- # module level superclass for all exceptions
cdef class LxmlError(Error):
    """Common base class of all lxml exceptions.

    Every instance carries an ``error_log`` attribute: a copy of the log
    passed to the constructor, or a copy of the global error log when no
    log was given.
    """
    def __init__(self, message, error_log=None):
        # initialise via the class following ``Error`` in the MRO
        super(_Error, self).__init__(message)
        self.error_log = (
            __copyGlobalErrorLog() if error_log is None else error_log.copy())
- cdef object _Error = Error
- # superclass for all syntax errors
class LxmlSyntaxError(LxmlError, SyntaxError):
    """Base class for all syntax errors.

    Inherits from both ``LxmlError`` and the built-in ``SyntaxError``, so
    it can be caught as either.
    """
cdef class C14NError(LxmlError):
    """Error during C14N (canonical XML) serialisation.
    """
- # version information
cdef tuple __unpackDottedVersion(version):
    """Convert an ASCII version byte string into a 4-tuple.

    '-' is treated like '.', missing components are filled with 0 and
    surplus components are dropped.  Numeric components become ints.
    A 'dev', 'alpha' or 'beta' component becomes -300, -200 or -100
    respectively, plus the number that follows the marker (if any).
    Any other non-numeric component is kept as a string.
    """
    parts = version.decode("ascii").replace('-', '.').split('.')
    parts = (parts + [0, 0, 0, 0])[:4]

    unpacked = []
    for part in parts:
        try:
            part = int(part)
        except ValueError:
            for marker, base_value in (('dev', -300), ('alpha', -200), ('beta', -100)):
                if part.startswith(marker):
                    serial = part[len(marker):]
                    part = base_value
                    if serial:
                        part += int(serial)
                    break
        unpacked.append(part)
    return tuple(unpacked)
cdef tuple __unpackIntVersion(int c_version, int base=100):
    """Split a packed integer version into a (major, minor, patch) tuple.

    Each component occupies one "digit" of the given base, e.g. with the
    default base 100, 20914 becomes (2, 9, 14).
    """
    cdef int major = (c_version // (base * base)) % base
    cdef int minor = (c_version // base) % base
    cdef int patch = c_version % base
    return (major, minor, patch)
# libxml2 version as a plain integer, parsed from ``tree.xmlParserVersion``
cdef int _LIBXML_VERSION_INT
try:
    # only the leading run of digits of the version string is used
    _LIBXML_VERSION_INT = int(
        re.match('[0-9]+', (<unsigned char*>tree.xmlParserVersion).decode("ascii")).group(0))
except Exception:
    # best effort: report the unparsable version string and fall back to 0
    # rather than letting the module import fail
    print("Unknown libxml2 version: " + (<unsigned char*>tree.xmlParserVersion).decode("latin1"))
    _LIBXML_VERSION_INT = 0

# public version tuples
LIBXML_VERSION = __unpackIntVersion(_LIBXML_VERSION_INT)
LIBXML_COMPILED_VERSION = __unpackIntVersion(tree.LIBXML_VERSION)
LXML_VERSION = __unpackDottedVersion(tree.LXML_VERSION_STRING)

# lxml's own version as a text string
__version__ = tree.LXML_VERSION_STRING.decode("ascii")
# Verbatim C code: defines version macros that fall back to 0 when the
# corresponding library macro (ZLIB_VERNUM / _LIBICONV_VERSION) is undefined.
cdef extern from *:
    """
    #ifdef ZLIB_VERNUM
    #define __lxml_zlib_version (ZLIB_VERNUM >> 4)
    #else
    #define __lxml_zlib_version 0
    #endif
    #ifdef _LIBICONV_VERSION
    #define __lxml_iconv_version (_LIBICONV_VERSION << 8)
    #else
    #define __lxml_iconv_version 0
    #endif
    """
    # zlib isn't included automatically by libxml2's headers
    #long ZLIB_HEX_VERSION "__lxml_zlib_version"
    long LIBICONV_HEX_VERSION "__lxml_iconv_version"

#ZLIB_COMPILED_VERSION = __unpackIntVersion(ZLIB_HEX_VERSION, base=0x10)
# The iconv value was shifted left by 8 bits above, so unpacking it with base
# 0x100 leaves the lowest component empty; only the first two are kept.
# The result is (0, 0) when _LIBICONV_VERSION is not defined.
ICONV_COMPILED_VERSION = __unpackIntVersion(LIBICONV_HEX_VERSION, base=0x100)[:2]
# Verbatim C code: a 0-terminated array holding the names of all libxml2
# features whose LIBXML_*_ENABLED macro is defined when this module is compiled.
cdef extern from "libxml/xmlversion.h":
    """
    static const char* const _lxml_lib_features[] = {
    #ifdef LIBXML_HTML_ENABLED
        "html",
    #endif
    #ifdef LIBXML_FTP_ENABLED
        "ftp",
    #endif
    #ifdef LIBXML_HTTP_ENABLED
        "http",
    #endif
    #ifdef LIBXML_CATALOG_ENABLED
        "catalog",
    #endif
    #ifdef LIBXML_XPATH_ENABLED
        "xpath",
    #endif
    #ifdef LIBXML_ICONV_ENABLED
        "iconv",
    #endif
    #ifdef LIBXML_ICU_ENABLED
        "icu",
    #endif
    #ifdef LIBXML_REGEXP_ENABLED
        "regexp",
    #endif
    #ifdef LIBXML_SCHEMAS_ENABLED
        "xmlschema",
    #endif
    #ifdef LIBXML_SCHEMATRON_ENABLED
        "schematron",
    #endif
    #ifdef LIBXML_ZLIB_ENABLED
        "zlib",
    #endif
    #ifdef LIBXML_LZMA_ENABLED
        "lzma",
    #endif
        0
    };
    """
    # Cython-side name for the C array defined above
    const char* const* _LXML_LIB_FEATURES "_lxml_lib_features"
cdef set _copy_lib_features():
    """Return the names in the C array ``_LXML_LIB_FEATURES`` as a set of str.

    The array is walked up to (not including) its terminating NULL entry.
    """
    cdef set names = set()
    cdef Py_ssize_t idx = 0
    while _LXML_LIB_FEATURES[idx] is not NULL:
        names.add(_LXML_LIB_FEATURES[idx].decode('ASCII'))
        idx += 1
    return names
# Feature names taken from the compile-time LIBXML_*_ENABLED macros.
LIBXML_COMPILED_FEATURES = _copy_lib_features()
# Feature names for which ``xmlHasFeature()`` reports true when the module
# is loaded.  Commented-out entries list feature IDs that are not reported.
LIBXML_FEATURES = {
    feature_name for feature_id, feature_name in [
        #XML_WITH_THREAD = 1
        #XML_WITH_TREE = 2
        #XML_WITH_OUTPUT = 3
        #XML_WITH_PUSH = 4
        #XML_WITH_READER = 5
        #XML_WITH_PATTERN = 6
        #XML_WITH_WRITER = 7
        #XML_WITH_SAX1 = 8
        (xmlparser.XML_WITH_FTP, "ftp"),  # XML_WITH_FTP = 9
        (xmlparser.XML_WITH_HTTP, "http"),  # XML_WITH_HTTP = 10
        #XML_WITH_VALID = 11
        (xmlparser.XML_WITH_HTML, "html"),  # XML_WITH_HTML = 12
        #XML_WITH_LEGACY = 13
        #XML_WITH_C14N = 14
        (xmlparser.XML_WITH_CATALOG, "catalog"),  # XML_WITH_CATALOG = 15
        (xmlparser.XML_WITH_XPATH, "xpath"),  # XML_WITH_XPATH = 16
        #XML_WITH_XPTR = 17
        #XML_WITH_XINCLUDE = 18
        (xmlparser.XML_WITH_ICONV, "iconv"),  # XML_WITH_ICONV = 19
        #XML_WITH_ISO8859X = 20
        #XML_WITH_UNICODE = 21
        (xmlparser.XML_WITH_REGEXP, "regexp"),  # XML_WITH_REGEXP = 22
        #XML_WITH_AUTOMATA = 23
        #XML_WITH_EXPR = 24
        (xmlparser.XML_WITH_SCHEMAS, "xmlschema"),  # XML_WITH_SCHEMAS = 25
        (xmlparser.XML_WITH_SCHEMATRON, "schematron"),  # XML_WITH_SCHEMATRON = 26
        #XML_WITH_MODULES = 27
        #XML_WITH_DEBUG = 28
        #XML_WITH_DEBUG_MEM = 29
        #XML_WITH_DEBUG_RUN = 30  # unused
        (xmlparser.XML_WITH_ZLIB, "zlib"),  # XML_WITH_ZLIB = 31
        (xmlparser.XML_WITH_ICU, "icu"),  # XML_WITH_ICU = 32
        (xmlparser.XML_WITH_LZMA, "lzma"),  # XML_WITH_LZMA = 33
    ] if xmlparser.xmlHasFeature(feature_id)
}

# C-level flag: true if ``xmlHasFeature()`` reports zlib support
cdef bint HAS_ZLIB_COMPRESSION = xmlparser.xmlHasFeature(xmlparser.XML_WITH_ZLIB)
- # class for temporary storage of Python references,
- # used e.g. for XPath results
@cython.final
@cython.internal
cdef class _TempStore:
    """List-backed holder that keeps Python references alive temporarily."""
    # the objects currently being kept alive
    cdef list _storage
    def __init__(self):
        self._storage = []

    cdef int add(self, obj) except -1:
        """Append 'obj' to the store.  Returns 0, or -1 with an exception set."""
        self._storage.append(obj)
        return 0

    cdef int clear(self) except -1:
        """Drop all stored references.  Returns 0, or -1 with an exception set."""
        # empty the list in place instead of rebinding the attribute
        del self._storage[:]
        return 0
- # class for temporarily storing exceptions raised in extensions
@cython.internal
cdef class _ExceptionContext:
    """Holder for an exception that should be (re-)raised at a later point."""
    # None, or a 3-tuple: either ``sys.exc_info()`` or (exception, None, None)
    cdef object _exc_info

    cdef int clear(self) except -1:
        """Forget any stored exception."""
        self._exc_info = None
        return 0

    cdef void _store_raised(self) noexcept:
        """Remember the exception that is currently being handled.

        Never raises: if storing fails, the failure itself is stored
        instead, and anything beyond that is swallowed.
        """
        try:
            self._exc_info = sys.exc_info()
        except BaseException as failure:
            self._store_exception(failure)
        finally:
            return  # and swallow any further exceptions

    cdef int _store_exception(self, exception) except -1:
        """Remember an exception object that has no value/traceback attached."""
        self._exc_info = (exception, None, None)
        return 0

    cdef bint _has_raised(self) except -1:
        """Tell whether an exception is currently stored."""
        return self._exc_info is not None

    cdef int _raise_if_stored(self) except -1:
        """Raise the stored exception, if any, and clear the store.

        Returns 0 when nothing is stored.
        """
        stored = self._exc_info
        if stored is None:
            return 0
        exc_type, exc_value, exc_tb = stored
        self._exc_info = None
        if exc_value is not None or exc_tb is not None:
            raise exc_type, exc_value, exc_tb
        raise exc_type
- # type of a function that steps from node to node
- ctypedef public xmlNode* (*_node_to_node_function)(xmlNode*)
- ################################################################################
- # Include submodules
- include "proxy.pxi" # Proxy handling (element backpointers/memory/etc.)
- include "apihelpers.pxi" # Private helper functions
- include "xmlerror.pxi" # Error and log handling
- ################################################################################
- # Public Python API
- @cython.final
- @cython.freelist(8)
- cdef public class _Document [ type LxmlDocumentType, object LxmlDocument ]:
- """Internal base class to reference a libxml document.
- When instances of this class are garbage collected, the libxml
- document is cleaned up.
- """
- cdef int _ns_counter
- cdef bytes _prefix_tail
- cdef xmlDoc* _c_doc
- cdef _BaseParser _parser
- def __dealloc__(self):
- # if there are no more references to the document, it is safe
- # to clean the whole thing up, as all nodes have a reference to
- # the document
- tree.xmlFreeDoc(self._c_doc)
- @cython.final
- cdef getroot(self):
- # return an element proxy for the document root
- cdef xmlNode* c_node
- c_node = tree.xmlDocGetRootElement(self._c_doc)
- if c_node is NULL:
- return None
- return _elementFactory(self, c_node)
- @cython.final
- cdef bint hasdoctype(self) noexcept:
- # DOCTYPE gets parsed into internal subset (xmlDTD*)
- return self._c_doc is not NULL and self._c_doc.intSubset is not NULL
- @cython.final
- cdef getdoctype(self):
- # get doctype info: root tag, public/system ID (or None if not known)
- cdef tree.xmlDtd* c_dtd
- cdef xmlNode* c_root_node
- public_id = None
- sys_url = None
- c_dtd = self._c_doc.intSubset
- if c_dtd is not NULL:
- if c_dtd.ExternalID is not NULL:
- public_id = funicode(c_dtd.ExternalID)
- if c_dtd.SystemID is not NULL:
- sys_url = funicode(c_dtd.SystemID)
- c_dtd = self._c_doc.extSubset
- if c_dtd is not NULL:
- if not public_id and c_dtd.ExternalID is not NULL:
- public_id = funicode(c_dtd.ExternalID)
- if not sys_url and c_dtd.SystemID is not NULL:
- sys_url = funicode(c_dtd.SystemID)
- c_root_node = tree.xmlDocGetRootElement(self._c_doc)
- if c_root_node is NULL:
- root_name = None
- else:
- root_name = funicode(c_root_node.name)
- return root_name, public_id, sys_url
- @cython.final
- cdef getxmlinfo(self):
- # return XML version and encoding (or None if not known)
- cdef xmlDoc* c_doc = self._c_doc
- if c_doc.version is NULL:
- version = None
- else:
- version = funicode(c_doc.version)
- if c_doc.encoding is NULL:
- encoding = None
- else:
- encoding = funicode(c_doc.encoding)
- return version, encoding
- @cython.final
- cdef isstandalone(self):
- # returns True for "standalone=true",
- # False for "standalone=false", None if not provided
- if self._c_doc.standalone == -1:
- return None
- else:
- return <bint>(self._c_doc.standalone == 1)
- @cython.final
- cdef bytes buildNewPrefix(self):
- # get a new unique prefix ("nsX") for this document
- cdef bytes ns
- if self._ns_counter < len(_PREFIX_CACHE):
- ns = _PREFIX_CACHE[self._ns_counter]
- else:
- ns = python.PyBytes_FromFormat("ns%d", self._ns_counter)
- if self._prefix_tail is not None:
- ns += self._prefix_tail
- self._ns_counter += 1
- if self._ns_counter < 0:
- # overflow!
- self._ns_counter = 0
- if self._prefix_tail is None:
- self._prefix_tail = b"A"
- else:
- self._prefix_tail += b"A"
- return ns
@cython.final
cdef xmlNs* _findOrBuildNodeNs(self, xmlNode* c_node,
                               const_xmlChar* c_href, const_xmlChar* c_prefix,
                               bint is_attribute) except NULL:
    """Find the namespace declaration for ``c_href`` that applies to
    ``c_node``, or declare a new one on the node.

    An existing declaration is reused, except that attributes never get
    the default (prefix-less) namespace.  Raises MemoryError if the new
    declaration cannot be allocated.
    """
    cdef xmlNs* c_ns
    cdef python.PyObject* dict_result

    # the outer test keeps the message formatting off the normal path
    if c_node.type != tree.XML_ELEMENT_NODE:
        assert c_node.type == tree.XML_ELEMENT_NODE, \
            "invalid node type %d, expected %d" % (
            c_node.type, tree.XML_ELEMENT_NODE)

    # reuse an existing declaration where that is safe
    c_ns = _searchNsByHref(c_node, c_href, is_attribute)
    if c_ns is not NULL and not (is_attribute and c_ns.prefix is NULL):
        # namespaced attributes must not end up in the default
        # namespace, as that would break serialisation
        return c_ns

    # nothing usable found: pick a prefix unless the caller supplied one
    if c_prefix is NULL:
        dict_result = python.PyDict_GetItem(
            _DEFAULT_NAMESPACE_PREFIXES, <unsigned char*>c_href)
        if dict_result is NULL:
            prefix = self.buildNewPrefix()
        else:
            prefix = <object>dict_result
        c_prefix = _xcstr(prefix)

    # keep generating prefixes until one is free in this context
    while tree.xmlSearchNs(self._c_doc, c_node, c_prefix) is not NULL:
        prefix = self.buildNewPrefix()
        c_prefix = _xcstr(prefix)

    # declare the namespace on the node and hand it back
    c_ns = tree.xmlNewNs(c_node, c_href, c_prefix)
    if c_ns is NULL:
        raise MemoryError()
    return c_ns
@cython.final
cdef int _setNodeNs(self, xmlNode* c_node, const_xmlChar* c_href) except -1:
    """Look up (or create) the namespace for ``c_href`` and assign it
    to the node.
    """
    cdef xmlNs* c_ns = self._findOrBuildNodeNs(c_node, c_href, NULL, False)
    tree.xmlSetNs(c_node, c_ns)
    return 0
cdef tuple __initPrefixCache():
    """Build the tuple of the 26 prefixes b"ns0" .. b"ns25"."""
    cdef int i
    cdef list prefixes = []
    for i in range(26):
        prefixes.append(python.PyBytes_FromFormat("ns%d", i))
    return tuple(prefixes)

# precomputed prefixes, indexed by the namespace counter value
cdef tuple _PREFIX_CACHE = __initPrefixCache()
cdef _Document _documentFactory(xmlDoc* c_doc, _BaseParser parser):
    """Wrap ``c_doc`` in a new _Document.

    The prefix counter and tail start out reset.  When no parser is
    given, the default parser of the global parser context is used.
    """
    cdef _Document doc = _Document.__new__(_Document)
    doc._c_doc = c_doc
    doc._ns_counter = 0
    doc._prefix_tail = None
    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    doc._parser = parser
    return doc
# Search function that finds the first run of characters outside the set
# accepted for a DOCTYPE public ID (used by DocInfo.public_id).
cdef object _find_invalid_public_id_characters = re.compile(
    ur"[^\x20\x0D\x0Aa-zA-Z0-9'()+,./:=?;!*#@$_%-]+"
).search
cdef class DocInfo:
    """Document information provided by parser and DTD.

    Gives access to the DOCTYPE (root name, public ID, system URL), the
    XML declaration (version, encoding, standalone flag), the document
    URL and the internal/external DTD subsets of a document.
    """
    cdef _Document _doc

    def __cinit__(self, tree):
        "Create a DocInfo object for an ElementTree object or root Element."
        self._doc = _documentOrRaise(tree)
        root_name, public_id, system_url = self._doc.getdoctype()
        if not root_name and (public_id or system_url):
            raise ValueError, "Could not find root node"

    @property
    def root_name(self):
        """Returns the name of the root node as defined by the DOCTYPE."""
        root_name, public_id, system_url = self._doc.getdoctype()
        return root_name

    @cython.final
    cdef tree.xmlDtd* _get_c_dtd(self):
        """Return the DTD. Create it if it does not yet exist.

        Returns the existing internal subset if there is one, otherwise
        the result of xmlCreateIntSubset(), named after the root element
        when the document has one.  Callers treat NULL as a failure.
        """
        cdef xmlDoc* c_doc = self._doc._c_doc
        cdef xmlNode* c_root_node
        cdef const_xmlChar* c_name

        if c_doc.intSubset:
            return c_doc.intSubset

        c_root_node = tree.xmlDocGetRootElement(c_doc)
        c_name = c_root_node.name if c_root_node else NULL
        return tree.xmlCreateIntSubset(c_doc, c_name, NULL, NULL)

    def clear(self):
        """Removes DOCTYPE and internal subset from the document."""
        cdef xmlDoc* c_doc = self._doc._c_doc
        cdef tree.xmlNode* c_dtd = <xmlNode*>c_doc.intSubset
        if c_dtd is NULL:
            return
        tree.xmlUnlinkNode(c_dtd)
        tree.xmlFreeNode(c_dtd)

    property public_id:
        """Public ID of the DOCTYPE.

        Mutable.  May be set to a valid string or None.  If a DTD does not
        exist, setting this variable (even to None) will create one.
        """
        def __get__(self):
            root_name, public_id, system_url = self._doc.getdoctype()
            return public_id

        def __set__(self, value):
            cdef xmlChar* c_value = NULL
            if value is not None:
                match = _find_invalid_public_id_characters(value)
                if match:
                    raise ValueError, f'Invalid character(s) {match.group(0)!r} in public_id.'
                value = _utf8(value)
                c_value = tree.xmlStrdup(_xcstr(value))
                if not c_value:
                    raise MemoryError()

            c_dtd = self._get_c_dtd()
            if not c_dtd:
                # do not leak the copy when the DTD could not be created
                tree.xmlFree(c_value)
                raise MemoryError()
            if c_dtd.ExternalID:
                tree.xmlFree(<void*>c_dtd.ExternalID)
            c_dtd.ExternalID = c_value

    property system_url:
        """System ID of the DOCTYPE.

        Mutable.  May be set to a valid string or None.  If a DTD does not
        exist, setting this variable (even to None) will create one.
        """
        def __get__(self):
            root_name, public_id, system_url = self._doc.getdoctype()
            return system_url

        def __set__(self, value):
            cdef xmlChar* c_value = NULL
            if value is not None:
                bvalue = _utf8(value)
                # sys_url may be any valid unicode string that can be
                # enclosed in single quotes or quotes.
                if b"'" in bvalue and b'"' in bvalue:
                    raise ValueError(
                        'System URL may not contain both single (\') and double quotes (").')
                c_value = tree.xmlStrdup(_xcstr(bvalue))
                if not c_value:
                    raise MemoryError()

            c_dtd = self._get_c_dtd()
            if not c_dtd:
                # do not leak the copy when the DTD could not be created
                tree.xmlFree(c_value)
                raise MemoryError()
            if c_dtd.SystemID:
                tree.xmlFree(<void*>c_dtd.SystemID)
            c_dtd.SystemID = c_value

    @property
    def xml_version(self):
        """Returns the XML version as declared by the document."""
        xml_version, encoding = self._doc.getxmlinfo()
        return xml_version

    @property
    def encoding(self):
        """Returns the encoding name as declared by the document."""
        xml_version, encoding = self._doc.getxmlinfo()
        return encoding

    @property
    def standalone(self):
        """Returns the standalone flag as declared by the document.  The possible
        values are True (``standalone='yes'``), False
        (``standalone='no'`` or flag not provided in the declaration),
        and None (unknown or no declaration found).  Note that a
        normal truth test on this value will always tell if the
        ``standalone`` flag was set to ``'yes'`` or not.
        """
        return self._doc.isstandalone()

    property URL:
        "The source URL of the document (or None if unknown)."
        def __get__(self):
            if self._doc._c_doc.URL is NULL:
                return None
            return _decodeFilename(self._doc._c_doc.URL)

        def __set__(self, url):
            cdef xmlChar* c_newurl = NULL
            url = _encodeFilename(url)
            if url is not None:
                # Copy first and check the result, so that a failed
                # allocation leaves the current URL untouched instead of
                # silently resetting it (same policy as public_id and
                # system_url).
                c_newurl = tree.xmlStrdup(_xcstr(url))
                if c_newurl is NULL:
                    raise MemoryError()
            c_oldurl = self._doc._c_doc.URL
            self._doc._c_doc.URL = c_newurl
            if c_oldurl is not NULL:
                tree.xmlFree(<void*>c_oldurl)

    @property
    def doctype(self):
        """Returns a DOCTYPE declaration string for the document."""
        root_name, public_id, system_url = self._doc.getdoctype()
        if system_url:
            # If '"' in system_url, we must escape it with single
            # quotes, otherwise escape with double quotes. If url
            # contains both a single quote and a double quote, XML
            # standard is being violated.
            if '"' in system_url:
                quoted_system_url = f"'{system_url}'"
            else:
                quoted_system_url = f'"{system_url}"'
        if public_id:
            if system_url:
                return f'<!DOCTYPE {root_name} PUBLIC "{public_id}" {quoted_system_url}>'
            else:
                return f'<!DOCTYPE {root_name} PUBLIC "{public_id}">'
        elif system_url:
            return f'<!DOCTYPE {root_name} SYSTEM {quoted_system_url}>'
        elif self._doc.hasdoctype():
            return f'<!DOCTYPE {root_name}>'
        else:
            return ''

    @property
    def internalDTD(self):
        """Returns a DTD validator based on the internal subset of the document."""
        return _dtdFactory(self._doc._c_doc.intSubset)

    @property
    def externalDTD(self):
        """Returns a DTD validator based on the external subset of the document."""
        return _dtdFactory(self._doc._c_doc.extSubset)
- @cython.no_gc_clear
- cdef public class _Element [ type LxmlElementType, object LxmlElement ]:
- """Element class.
- References a document object and a libxml node.
- By pointing to a Document instance, a reference is kept to
- _Document as long as there is some pointer to a node in it.
- """
- cdef _Document _doc  # document object this element proxy refers to
- cdef xmlNode* _c_node  # libxml node wrapped by this proxy
- cdef object _tag  # cached tag value; None until the 'tag' property fills it in
def _init(self):
    """_init(self)

    Hook that is called after object initialisation.  Custom subclasses
    may override it, provided they recursively call _init() in the
    superclasses.
    """
@cython.linetrace(False)
@cython.profile(False)
def __dealloc__(self):
    # A proxy that never received a node has nothing to release.
    if self._c_node is NULL:
        return
    # Drop the proxy registration first, then let the node be freed
    # if nothing else needs it.
    _unregisterProxy(self)
    attemptDeallocation(self._c_node)
- # MANIPULATORS
def __setitem__(self, x, value):
    """__setitem__(self, x, value)

    Replaces the subelement at the given index, or the subelements
    selected by the given slice.
    """
    cdef xmlNode* c_target = NULL
    cdef xmlNode* c_tail_start
    cdef xmlDoc* c_source_doc
    cdef _Element new_child
    cdef bint left_to_right
    cdef Py_ssize_t slicelength = 0, step = 0
    _assertValidNode(self)
    if value is None:
        raise ValueError("cannot assign None")

    if isinstance(x, slice):
        # slice assignment: normalise the step to a direction + distance
        _findChildSlice(<slice>x, self._c_node, &c_target, &step, &slicelength)
        left_to_right = step > 0
        if not left_to_right:
            step = -step
        _replaceSlice(self, c_target, slicelength, step, left_to_right, value)
        return

    # plain item assignment
    new_child = value
    _assertValidNode(new_child)
    c_target = _findChild(self._c_node, x)
    if c_target is NULL:
        raise IndexError("list index out of range")
    # remember where the new node came from before it gets relinked
    c_source_doc = new_child._c_node.doc
    c_tail_start = new_child._c_node.next
    _removeText(c_target.next)
    tree.xmlReplaceNode(c_target, new_child._c_node)
    _moveTail(c_tail_start, new_child._c_node)
    moveNodeToDocument(self._doc, c_source_doc, new_child._c_node)
    if not attemptDeallocation(c_target):
        # the replaced node is still in use somewhere
        moveNodeToDocument(self._doc, c_target.doc, c_target)
def __delitem__(self, x):
    """__delitem__(self, x)

    Deletes the subelement at the given index, or all subelements
    selected by the given slice.
    """
    cdef xmlNode* c_node = NULL
    cdef xmlNode* c_following
    cdef Py_ssize_t step = 0, slicelength = 0
    _assertValidNode(self)

    if not isinstance(x, slice):
        # single item deletion
        c_node = _findChild(self._c_node, x)
        if c_node is NULL:
            raise IndexError(f"index out of range: {x}")
        _removeNode(self._doc, c_node)
        return

    if not _isFullSlice(<slice>x):
        # partial slice deletion
        _findChildSlice(<slice>x, self._c_node, &c_node, &step, &slicelength)
        _deleteSlice(self._doc, c_node, slicelength, step)
        return

    # full slice: remove every child element in turn
    c_node = self._c_node.children
    if c_node is not NULL and not _isElement(c_node):
        c_node = _nextElement(c_node)
    while c_node is not NULL:
        c_following = _nextElement(c_node)
        _removeNode(self._doc, c_node)
        c_node = c_following
def __deepcopy__(self, memo):
    """__deepcopy__(self, memo)

    Delegates to __copy__(); ``memo`` is not used.
    """
    return self.__copy__()
def __copy__(self):
    """__copy__(self)

    Copies this node into a new document and returns the proxy for
    the copy, or None if no matching node exists in the new document.
    """
    cdef xmlDoc* c_new_doc
    cdef xmlNode* c_candidate
    cdef _Document new_doc
    _assertValidNode(self)
    c_new_doc = _copyDocRoot(self._doc._c_doc, self._c_node)  # recursive
    new_doc = _documentFactory(c_new_doc, self._doc._parser)
    root = new_doc.getroot()
    if root is not None:
        return root

    # no root element (Comment/PI): find the first top-level node
    # of the same type as this one
    c_candidate = c_new_doc.children
    while c_candidate is not NULL and c_candidate.type != self._c_node.type:
        c_candidate = c_candidate.next
    if c_candidate is NULL:
        return None
    return _elementFactory(new_doc, c_candidate)
def set(self, key, value):
    """set(self, key, value)

    Sets an element attribute.

    In HTML documents (not XML or XHTML), the value None is allowed
    and creates an attribute without value (just the attribute name).
    """
    _assertValidNode(self)
    _setAttributeValue(self, key, value)
def append(self, _Element element not None):
    """append(self, element)

    Adds a subelement to the end of this element.
    """
    # both proxies must refer to valid nodes before anything is linked
    _assertValidNode(self)
    _assertValidNode(element)
    _appendChild(self, element)
def addnext(self, _Element element not None):
    """addnext(self, element)

    Adds the element as a following sibling directly after this
    element.

    This is normally used to set a processing instruction or comment
    after the root node of a document.  Note that tail text is
    automatically discarded when adding at the root level.
    """
    cdef xmlNode* c_parent
    _assertValidNode(self)
    _assertValidNode(element)
    c_parent = self._c_node.parent
    if c_parent != NULL and not _isElement(c_parent):
        # the parent is not an element, i.e. we are at the root level
        if (element._c_node.type != tree.XML_PI_NODE and
                element._c_node.type != tree.XML_COMMENT_NODE):
            raise TypeError("Only processing instructions and comments can be siblings of the root element")
        element.tail = None
    _appendSibling(self, element)
def addprevious(self, _Element element not None):
    """addprevious(self, element)

    Adds the element as a preceding sibling directly before this
    element.

    This is normally used to set a processing instruction or comment
    before the root node of a document.  Note that tail text is
    automatically discarded when adding at the root level.
    """
    cdef xmlNode* c_parent
    _assertValidNode(self)
    _assertValidNode(element)
    c_parent = self._c_node.parent
    if c_parent != NULL and not _isElement(c_parent):
        # the parent is not an element, i.e. we are at the root level
        if element._c_node.type not in (tree.XML_PI_NODE, tree.XML_COMMENT_NODE):
            raise TypeError("Only processing instructions and comments can be siblings of the root element")
        element.tail = None
    _prependSibling(self, element)
def extend(self, elements):
    """extend(self, elements)

    Extends the current children by the elements in the iterable.
    Raises TypeError if the iterable yields None.
    """
    cdef _Element child
    _assertValidNode(self)
    for child in elements:
        if child is None:
            raise TypeError("Node must not be None")
        _assertValidNode(child)
        _appendChild(self, child)
def clear(self, bint keep_tail=False):
    """clear(self, keep_tail=False)

    Resets an element.  This function removes all subelements, clears
    all attributes and sets the text and tail properties to None.

    Pass ``keep_tail=True`` to leave the tail text untouched.
    """
    cdef xmlAttr* c_attrs
    cdef xmlNode* c_self
    cdef xmlNode* c_child
    cdef xmlNode* c_following
    _assertValidNode(self)
    c_self = self._c_node

    # drop the text, and the tail unless it should be kept
    _removeText(c_self.children)
    if not keep_tail:
        _removeText(c_self.next)

    # detach the attribute list before freeing it
    c_attrs = c_self.properties
    if c_attrs:
        c_self.properties = NULL
        tree.xmlFreePropList(c_attrs)

    # remove the child elements one by one
    c_child = c_self.children
    if c_child and not _isElement(c_child):
        c_child = _nextElement(c_child)
    while c_child is not NULL:
        c_following = _nextElement(c_child)
        _removeNode(self._doc, c_child)
        c_child = c_following
def insert(self, index: int, _Element element not None):
    """insert(self, index, element)

    Inserts a subelement at the given position in this element.
    If there is no child at that position, the element is appended.
    """
    cdef xmlNode* c_anchor
    cdef xmlNode* c_tail_start
    cdef xmlDoc* c_source_doc
    _assertValidNode(self)
    _assertValidNode(element)
    c_anchor = _findChild(self._c_node, index)
    if c_anchor is NULL:
        _appendChild(self, element)
        return

    # prevent cycles
    if _isAncestorOrSame(element._c_node, self._c_node):
        raise ValueError("cannot append parent to itself")

    # remember the old location before relinking the node
    c_source_doc = element._c_node.doc
    c_tail_start = element._c_node.next
    tree.xmlAddPrevSibling(c_anchor, element._c_node)
    _moveTail(c_tail_start, element._c_node)
    moveNodeToDocument(self._doc, c_source_doc, element._c_node)
def remove(self, _Element element not None):
    """remove(self, element)

    Removes a matching subelement.  Unlike the find methods, this
    method compares elements based on identity, not on tag value
    or contents.  Raises ValueError if the element is not a direct
    child of this element.
    """
    cdef xmlNode* c_child
    cdef xmlNode* c_tail_start
    _assertValidNode(self)
    _assertValidNode(element)
    c_child = element._c_node
    if c_child.parent is not self._c_node:
        raise ValueError("Element is not a child of this node.")
    c_tail_start = element._c_node.next
    tree.xmlUnlinkNode(c_child)
    _moveTail(c_tail_start, c_child)
    # fix namespace declarations
    moveNodeToDocument(self._doc, c_child.doc, c_child)
def replace(self, _Element old_element not None,
            _Element new_element not None):
    """replace(self, old_element, new_element)

    Replaces a subelement with the element passed as second argument.
    Raises ValueError if ``old_element`` is not a direct child of this
    element or if the replacement would create a cycle.
    """
    cdef xmlNode* c_old
    cdef xmlNode* c_old_tail_start
    cdef xmlNode* c_new
    cdef xmlNode* c_new_tail_start
    cdef xmlDoc* c_source_doc
    _assertValidNode(self)
    _assertValidNode(old_element)
    _assertValidNode(new_element)

    c_old = old_element._c_node
    if c_old.parent is not self._c_node:
        raise ValueError("Element is not a child of this node.")
    c_new = new_element._c_node
    # prevent cycles
    if _isAncestorOrSame(c_new, self._c_node):
        raise ValueError("cannot append parent to itself")

    # remember siblings and source document, then swap the nodes
    c_old_tail_start = c_old.next
    c_new_tail_start = c_new.next
    c_source_doc = c_new.doc
    tree.xmlReplaceNode(c_old, c_new)
    _moveTail(c_new_tail_start, c_new)
    _moveTail(c_old_tail_start, c_old)
    moveNodeToDocument(self._doc, c_source_doc, c_new)
    # fix namespace declarations
    moveNodeToDocument(self._doc, c_old.doc, c_old)
- # PROPERTIES
property tag:
    """Element tag.

    The value is computed on first access and cached afterwards.
    """
    def __get__(self):
        if self._tag is None:
            _assertValidNode(self)
            self._tag = _namespacedName(self._c_node)
        return self._tag

    def __set__(self, value):
        cdef _BaseParser parser
        _assertValidNode(self)
        ns, name = _getNsTag(value)
        # HTML parsers validate names with their own check
        parser = self._doc._parser
        if parser is None or not parser._for_html:
            _tagValidOrRaise(name)
        else:
            _htmlTagValidOrRaise(name)
        self._tag = value
        tree.xmlNodeSetName(self._c_node, _xcstr(name))
        if ns is not None:
            self._doc._setNodeNs(self._c_node, _xcstr(ns))
        else:
            self._c_node.ns = NULL
@property
def attrib(self):
    """Element attribute dictionary.  Where possible, use get(), set(),
    keys(), values() and items() to access element attributes.

    Each access creates a new _Attrib object for this element.
    """
    return _Attrib.__new__(_Attrib, self)
property text:
    """Text before the first subelement.  This is either a string or
    the value None, if there was no text.
    """
    def __get__(self):
        _assertValidNode(self)
        return _collectText(self._c_node.children)

    def __set__(self, value):
        _assertValidNode(self)
        # a QName is resolved against this element before it is stored
        if isinstance(value, QName):
            value = _resolveQNameText(self, value).decode('utf8')
        _setNodeText(self._c_node, value)

    # There is deliberately no deleter: using 'del el.text' is the
    # wrong thing to do.
property tail:
    """Text after this element's end tag, but before the next sibling
    element's start tag.  This is either a string or the value None, if
    there was no text.
    """
    def __get__(self):
        _assertValidNode(self)
        return _collectText(self._c_node.next)

    def __set__(self, value):
        _assertValidNode(self)
        _setTailText(self._c_node, value)

    # There is deliberately no deleter: using 'del el.tail' is the
    # wrong thing to do.
- # not in ElementTree, read-only
@property
def prefix(self):
    """Namespace prefix or None.

    None is returned both when the node has no namespace and when its
    namespace has no prefix.
    """
    if self._c_node.ns is NULL or self._c_node.ns.prefix is NULL:
        return None
    return funicode(self._c_node.ns.prefix)
- # not in ElementTree, readable and writable
property sourceline:
    """Original line number as found by the parser or None if unknown.

    Setting a value that is not positive stores 0.
    """
    def __get__(self):
        cdef long line
        _assertValidNode(self)
        line = tree.xmlGetLineNo(self._c_node)
        if line > 0:
            return line
        return None

    def __set__(self, line):
        _assertValidNode(self)
        self._c_node.line = 0 if line <= 0 else line
- # not in ElementTree, read-only
@property
def nsmap(self):
    """Namespace prefix->URI mapping known in the context of this
    Element.  This includes all namespace declarations of the
    parents.

    The mapping is built anew on access, so note that changing the
    returned dict has no effect on the Element.
    """
    _assertValidNode(self)
    return _build_nsmap(self._c_node)
- # not in ElementTree, readable and writable
property base:
    """The base URI of the Element (xml:base or HTML base URL).
    None if the base URI is unknown.

    Note that the value depends on the URL of the document that
    holds the Element if there is no xml:base attribute on the
    Element or its ancestors.

    Setting this property will set an xml:base attribute on the
    Element, regardless of the document type (XML or HTML).
    """
    def __get__(self):
        _assertValidNode(self)
        c_base = tree.xmlNodeGetBase(self._doc._c_doc, self._c_node)
        if c_base is not NULL:
            # the string returned by libxml2 is ours to free
            try:
                return _decodeFilename(c_base)
            finally:
                tree.xmlFree(c_base)
        # no base found for the node: fall back to the document URL
        if self._doc._c_doc.URL is NULL:
            return None
        return _decodeFilename(self._doc._c_doc.URL)

    def __set__(self, url):
        cdef const_xmlChar* c_base = NULL
        _assertValidNode(self)
        if url is not None:
            # 'url' keeps the encoded object alive while c_base is used
            url = _encodeFilename(url)
            c_base = _xcstr(url)
        tree.xmlNodeSetBase(self._c_node, c_base)
- # ACCESSORS
def __repr__(self):
    """__repr__(self)

    Shows the element tag and the id() of this proxy object.
    """
    details = (self.tag, id(self))
    return "<Element %s at 0x%x>" % details
def __getitem__(self, x):
    """Returns the subelement at the given position or the requested
    slice.

    Raises IndexError if there is no subelement at an integer position.
    """
    cdef xmlNode* c_node = NULL
    cdef Py_ssize_t step = 0, slicelength = 0
    cdef Py_ssize_t collected, i
    cdef _node_to_node_function advance
    cdef list result
    _assertValidNode(self)

    if not isinstance(x, slice):
        # plain indexing
        c_node = _findChild(self._c_node, x)
        if c_node is NULL:
            raise IndexError("list index out of range")
        return _elementFactory(self._doc, c_node)

    # slicing
    if _isFullSlice(<slice>x):
        return _collectChildren(self)
    _findChildSlice(<slice>x, self._c_node, &c_node, &step, &slicelength)
    if c_node is NULL:
        return []

    # a negative step walks backwards over the siblings
    if step > 0:
        advance = _nextElement
    else:
        step = -step
        advance = _previousElement

    result = []
    collected = 0
    while c_node is not NULL and collected < slicelength:
        result.append(_elementFactory(self._doc, c_node))
        collected += 1
        # move 'step' elements on, stopping early at the end
        for i in range(step):
            c_node = advance(c_node)
            if c_node is NULL:
                break
    return result
def __len__(self):
    """__len__(self)

    Returns the number of subelements.
    """
    _assertValidNode(self)
    c_first_child = self._c_node.children
    return _countElements(c_first_child)
def __bool__(self):
    """__bool__(self)

    Issues a FutureWarning and then reports whether the element has
    children.
    """
    import warnings
    message = (
        "Truth-testing of elements was a source of confusion and will always "
        "return True in future versions. "
        "Use specific 'len(elem)' or 'elem is not None' test instead."
    )
    warnings.warn(message, FutureWarning)
    # emulate old behaviour
    _assertValidNode(self)
    return _hasChild(self._c_node)
def __contains__(self, element):
    """__contains__(self, element)

    True if ``element`` is an _Element whose node is a direct child
    of this element's node.
    """
    cdef xmlNode* c_candidate
    _assertValidNode(self)
    if isinstance(element, _Element):
        c_candidate = (<_Element>element)._c_node
        return c_candidate is not NULL and c_candidate.parent is self._c_node
    return 0
def __iter__(self):
    """__iter__(self)

    Returns an ElementChildIterator over this element.
    """
    return ElementChildIterator(self)
def __reversed__(self):
    """__reversed__(self)

    Returns an ElementChildIterator over this element that runs in
    reversed order.
    """
    return ElementChildIterator(self, reversed=True)
- def index(self, child: _Element, start: int = None, stop: int = None):
- """index(self, child, start=None, stop=None)
- Find the position of the child within the parent.
- The optional ``start`` and ``stop`` arguments restrict the search to a
- slice of the children; negative values are counted from the right.
- Raises ValueError if the child does not belong to this element or is
- not found within the requested slice.
- This method is not part of the original ElementTree API.
- """
- cdef Py_ssize_t k, l
- cdef Py_ssize_t c_start, c_stop
- cdef xmlNode* c_child
- cdef xmlNode* c_start_node
- _assertValidNode(self)
- _assertValidNode(child)
- c_child = child._c_node
- if c_child.parent is not self._c_node:
- raise ValueError, "Element is not a child of this node."
- # handle the unbounded search straight away (normal case)
- if stop is None and (start is None or start == 0):
- k = 0
- c_child = c_child.prev
- while c_child is not NULL:  # count the elements that precede the child
- if _isElement(c_child):
- k += 1
- c_child = c_child.prev
- return k
- # check indices
- if start is None:
- c_start = 0
- else:
- c_start = start
- if stop is None:
- c_stop = 0  # 0 stands for "no stop given" from here on
- else:
- c_stop = stop
- if c_stop == 0 or \
- c_start >= c_stop and (c_stop > 0 or c_start < 0):
- raise ValueError, "list.index(x): x not in slice"
- # for negative slice indices, check slice before searching index
- if c_start < 0 or c_stop < 0:
- # start from right, at most up to leftmost(c_start, c_stop)
- if c_start < c_stop:
- k = -c_start
- else:
- k = -c_stop
- c_start_node = self._c_node.last
- l = 1  # 1-based position of c_start_node, counted in elements from the right
- while c_start_node != c_child and l < k:
- if _isElement(c_start_node):
- l += 1
- c_start_node = c_start_node.prev
- if c_start_node == c_child:
- # found! before slice end?
- if c_stop < 0 and l <= -c_stop:
- raise ValueError, "list.index(x): x not in slice"
- elif c_start < 0:
- raise ValueError, "list.index(x): x not in slice"
- # now determine the index backwards from child
- c_child = c_child.prev
- k = 0  # number of elements found before the child so far
- if c_stop > 0:
- # we can optimize: stop after c_stop elements if not found
- while c_child != NULL and k < c_stop:
- if _isElement(c_child):
- k += 1
- c_child = c_child.prev
- if k < c_stop:
- return k
- else:
- # traverse all
- while c_child != NULL:
- if _isElement(c_child):
- k = k + 1
- c_child = c_child.prev
- if c_start > 0:
- if k >= c_start:
- return k
- else:
- return k
- if c_start != 0 or c_stop != 0:  # a slice was requested, so report it in the message
- raise ValueError, "list.index(x): x not in slice"
- else:
- raise ValueError, "list.index(x): x not in list"
def get(self, key, default=None):
    """get(self, key, default=None)

    Gets an element attribute; ``default`` is passed on to the lookup.
    """
    _assertValidNode(self)
    value = _getAttributeValue(self, key, default)
    return value
def keys(self):
    """keys(self)

    Gets a list of attribute names.  The names are returned in an
    arbitrary order (just like for an ordinary Python dictionary).
    """
    _assertValidNode(self)
    # NOTE(review): selector 1 presumably means "names only" -
    # confirm against _collectAttributes()
    names = _collectAttributes(self._c_node, 1)
    return names
def values(self):
    """values(self)

    Gets element attribute values as a sequence of strings.  The
    attributes are returned in an arbitrary order.
    """
    _assertValidNode(self)
    # NOTE(review): selector 2 presumably means "values only" -
    # confirm against _collectAttributes()
    attribute_values = _collectAttributes(self._c_node, 2)
    return attribute_values
def items(self):
    """items(self)

    Gets element attributes, as a sequence.  The attributes are
    returned in an arbitrary order.
    """
    _assertValidNode(self)
    # NOTE(review): selector 3 presumably means "(name, value) pairs" -
    # confirm against _collectAttributes()
    pairs = _collectAttributes(self._c_node, 3)
    return pairs
def getchildren(self):
    """getchildren(self)

    Returns all direct children.  The elements are returned in
    document order.

    :deprecated: Note that this method has been deprecated as of
      ElementTree 1.3 and lxml 2.0.  New code should use
      ``list(element)`` or simply iterate over elements.
    """
    _assertValidNode(self)
    children = _collectChildren(self)
    return children
def getparent(self):
    """getparent(self)

    Returns the parent of this element or None for the root element.
    """
    # no _assertValidNode() call here: marked as not needed
    cdef xmlNode* c_parent = _parentElement(self._c_node)
    if c_parent is not NULL:
        return _elementFactory(self._doc, c_parent)
    return None
def getnext(self):
    """getnext(self)

    Returns the following sibling of this element or None.
    """
    # no _assertValidNode() call here: marked as not needed
    cdef xmlNode* c_sibling = _nextElement(self._c_node)
    if c_sibling is not NULL:
        return _elementFactory(self._doc, c_sibling)
    return None
def getprevious(self):
    """getprevious(self)

    Returns the preceding sibling of this element or None.
    """
    # no _assertValidNode() call here: marked as not needed
    cdef xmlNode* c_sibling = _previousElement(self._c_node)
    if c_sibling is not NULL:
        return _elementFactory(self._doc, c_sibling)
    return None
def itersiblings(self, tag=None, *tags, preceding=False):
    """itersiblings(self, tag=None, *tags, preceding=False)

    Iterate over the following or preceding siblings of this element.

    The direction is determined by the 'preceding' keyword which
    defaults to False, i.e. forward iteration over the following
    siblings.  When True, the iterator yields the preceding
    siblings in reverse document order, i.e. starting right before
    the current element and going backwards.

    Can be restricted to find only elements with specific tags,
    see `iter`.
    """
    # shortcut: no node at all in the requested direction
    if preceding:
        if self._c_node is not NULL and self._c_node.prev is NULL:
            return ITER_EMPTY
    elif self._c_node is not NULL and self._c_node.next is NULL:
        return ITER_EMPTY
    if tag is not None:
        tags = tags + (tag,)
    return SiblingsIterator(self, tags, preceding=preceding)
def iterancestors(self, tag=None, *tags):
    """iterancestors(self, tag=None, *tags)

    Iterate over the ancestors of this element (from parent to parent).

    Can be restricted to find only elements with specific tags,
    see `iter`.
    """
    # shortcut: a node without a parent has no ancestors
    if self._c_node is not NULL and self._c_node.parent is NULL:
        return ITER_EMPTY
    if tag is not None:
        tags = tags + (tag,)
    return AncestorsIterator(self, tags)
def iterdescendants(self, tag=None, *tags):
    """iterdescendants(self, tag=None, *tags)

    Iterate over the descendants of this element in document order.

    As opposed to ``el.iter()``, this iterator does not yield the element
    itself.  The returned elements can be restricted to find only elements
    with specific tags, see `iter`.
    """
    # shortcut: a node without children has no descendants
    if self._c_node is not NULL and self._c_node.children is NULL:
        return ITER_EMPTY
    if tag is not None:
        tags = tags + (tag,)
    return ElementDepthFirstIterator(self, tags, inclusive=False)
def iterchildren(self, tag=None, *tags, reversed=False):
    """iterchildren(self, tag=None, *tags, reversed=False)

    Iterate over the children of this element.

    As opposed to using normal iteration on this element, the returned
    elements can be reversed with the 'reversed' keyword and restricted
    to find only elements with specific tags, see `iter`.
    """
    # shortcut: nothing to iterate without child nodes
    if self._c_node is not NULL and self._c_node.children is NULL:
        return ITER_EMPTY
    if tag is not None:
        tags = tags + (tag,)
    return ElementChildIterator(self, tags, reversed=reversed)
def getroottree(self):
    """getroottree(self)

    Return an ElementTree for the root node of the document that
    contains this element.

    This is the same as following element.getparent() up the tree until
    it returns None (for the root element) and then build an ElementTree
    for the last parent that was returned.
    """
    _assertValidDoc(self._doc)
    root_tree = _elementTreeFactory(self._doc, None)
    return root_tree
def getiterator(self, tag=None, *tags):
    """getiterator(self, tag=None, *tags)

    Returns a sequence or iterator of all elements in the subtree in
    document order (depth first pre-order), starting with this
    element.

    Can be restricted to find only elements with specific tags,
    see `iter`.

    :deprecated: Note that this method is deprecated as of
      ElementTree 1.3 and lxml 2.0.  It returns an iterator in
      lxml, which diverges from the original ElementTree
      behaviour.  If you want an efficient iterator, use the
      ``element.iter()`` method instead.  You should only use this
      method in new code if you require backwards compatibility
      with older versions of lxml or ElementTree.
    """
    # a single 'tag' argument is merged into the positional tags
    if tag is not None:
        tags = tags + (tag,)
    return ElementDepthFirstIterator(self, tags)
def iter(self, tag=None, *tags):
    """iter(self, tag=None, *tags)

    Iterate over all elements in the subtree in document order (depth
    first pre-order), starting with this element.

    Can be restricted to find only elements with specific tags:
    pass ``"{ns}localname"`` as tag.  Either or both of ``ns`` and
    ``localname`` can be ``*`` for a wildcard; ``ns`` can be empty
    for no namespace.  ``"localname"`` is equivalent to ``"{}localname"``
    (i.e. no namespace) but ``"*"`` is ``"{*}*"`` (any or no namespace),
    not ``"{}*"``.

    You can also pass the Element, Comment, ProcessingInstruction and
    Entity factory functions to look only for the specific element type.

    Passing multiple tags (or a sequence of tags) instead of a single tag
    will let the iterator return all elements matching any of these tags,
    in document order.
    """
    # a single 'tag' argument is merged into the positional tags
    if tag is not None:
        tags = tags + (tag,)
    return ElementDepthFirstIterator(self, tags)
def itertext(self, tag=None, *tags, with_tail=True):
    """itertext(self, tag=None, *tags, with_tail=True)

    Iterates over the text content of a subtree.

    You can pass tag names to restrict text content to specific elements,
    see `iter`.

    You can set the ``with_tail`` keyword argument to ``False`` to skip
    over tail text.
    """
    # a single 'tag' argument is merged into the positional tags
    if tag is not None:
        tags = tags + (tag,)
    return ElementTextIterator(self, tags, with_tail=with_tail)
def makeelement(self, _tag, attrib=None, nsmap=None, **_extra):
    """makeelement(self, _tag, attrib=None, nsmap=None, **_extra)

    Creates a new element associated with the same document.
    """
    _assertValidDoc(self._doc)
    new_element = _makeElement(
        _tag, NULL, self._doc, None, None, None,
        attrib, nsmap, _extra)
    return new_element
def find(self, path, namespaces=None):
    """find(self, path, namespaces=None)

    Finds the first matching subelement, by tag name or path.

    The optional ``namespaces`` argument accepts a
    prefix-to-namespace mapping that allows the usage of XPath
    prefixes in the path expression.
    """
    # a QName is searched for by its text
    if isinstance(path, QName):
        path = (<QName>path).text
    return _elementpath.find(
        self, path, namespaces,
        with_prefixes=not _isHtmlDocument(self))
def findtext(self, path, default=None, namespaces=None):
    """findtext(self, path, default=None, namespaces=None)

    Finds text for the first matching subelement, by tag name or path.

    The optional ``namespaces`` argument accepts a
    prefix-to-namespace mapping that allows the usage of XPath
    prefixes in the path expression.
    """
    # a QName is searched for by its text
    if isinstance(path, QName):
        path = (<QName>path).text
    return _elementpath.findtext(
        self, path, default, namespaces,
        with_prefixes=not _isHtmlDocument(self))
def findall(self, path, namespaces=None):
    """findall(self, path, namespaces=None)

    Finds all matching subelements, by tag name or path.

    The optional ``namespaces`` argument accepts a
    prefix-to-namespace mapping that allows the usage of XPath
    prefixes in the path expression.
    """
    # a QName is searched for by its text
    if isinstance(path, QName):
        path = (<QName>path).text
    return _elementpath.findall(
        self, path, namespaces,
        with_prefixes=not _isHtmlDocument(self))
def iterfind(self, path, namespaces=None):
    """iterfind(self, path, namespaces=None)

    Iterates over all matching subelements, by tag name or path.

    The optional ``namespaces`` argument accepts a
    prefix-to-namespace mapping that allows the usage of XPath
    prefixes in the path expression.
    """
    # QName objects are looked up by their qualified text form.
    search_path = (<QName>path).text if isinstance(path, QName) else path
    use_prefixes = not _isHtmlDocument(self)
    return _elementpath.iterfind(
        self, search_path, namespaces, with_prefixes=use_prefixes)
def xpath(self, _path, *, namespaces=None, extensions=None,
          smart_strings=True, **_variables):
    """xpath(self, _path, namespaces=None, extensions=None, smart_strings=True, **_variables)

    Evaluate an xpath expression using the element as context node.
    """
    evaluate = XPathElementEvaluator(
        self,
        namespaces=namespaces,
        extensions=extensions,
        smart_strings=smart_strings,
    )
    return evaluate(_path, **_variables)
def cssselect(self, expr, *, translator='xml'):
    """cssselect(self, expr, translator='xml')

    Run the CSS expression on this element and its children,
    returning a list of the results.

    Equivalent to
    ``lxml.cssselect.CSSSelector(expr, translator=translator)(self)`` --
    note that pre-compiling the expression can provide a substantial
    speedup.
    """
    # Do the import here to make the dependency optional.
    from lxml.cssselect import CSSSelector
    selector = CSSSelector(expr, translator=translator)
    return selector(self)
@cython.linetrace(False)
cdef _Element _elementFactory(_Document doc, xmlNode* c_node):
    """Return the Python element object for ``c_node``, creating it if needed.

    Whatever getProxy() finds for the node is returned unchanged.  Otherwise
    an element class is looked up, instantiated and registered for the node.
    Returns None for a NULL node.  Raises TypeError if the looked-up class is
    not a type.
    """
    cdef _Element result
    result = getProxy(c_node)
    if result is not None:
        return result
    if c_node is NULL:
        return None

    element_class = <type> LOOKUP_ELEMENT_CLASS(
        ELEMENT_CLASS_LOOKUP_STATE, doc, c_node)
    # Exact 'type' instances skip the isinstance() call; anything else
    # (e.g. an instance of a metaclass) must still pass it.
    if type(element_class) is not type:
        if not isinstance(element_class, type):
            raise TypeError(f"Element class is not a type, got {type(element_class)}")
    if hasProxy(c_node):
        # prevent re-entry race condition - we just called into Python
        return getProxy(c_node)
    result = element_class.__new__(element_class)
    if hasProxy(c_node):
        # prevent re-entry race condition - we just called into Python
        # Clear the node pointer of the instance that is being discarded
        # before handing out the proxy that was registered in the meantime.
        result._c_node = NULL
        return getProxy(c_node)

    _registerProxy(result, doc, c_node)
    # _init() is only invoked for classes other than the plain _Element.
    if element_class is not _Element:
        result._init()
    return result
@cython.internal
cdef class __ContentOnlyElement(_Element):
    """Base class for elements that only carry text content.

    Mutators for children and attributes raise TypeError.  The read
    accessors behave like those of an element that has neither children
    nor attributes.
    """
    cdef int _raiseImmutable(self) except -1:
        # Shared failure path of all child/attribute mutators below.
        raise TypeError, "this element does not have children or attributes"

    def set(self, key, value):
        "set(self, key, value)"
        self._raiseImmutable()

    def append(self, value):
        "append(self, value)"
        self._raiseImmutable()

    def insert(self, index, value):
        "insert(self, index, value)"
        self._raiseImmutable()

    def __setitem__(self, index, value):
        "__setitem__(self, index, value)"
        self._raiseImmutable()

    @property
    def attrib(self):
        return IMMUTABLE_EMPTY_MAPPING

    property text:
        def __get__(self):
            _assertValidNode(self)
            return funicodeOrEmpty(self._c_node.content)

        def __set__(self, value):
            _assertValidNode(self)
            if value is None:
                c_text = <const_xmlChar*>NULL
            else:
                # Rebinding 'value' keeps the encoded bytes object referenced
                # while its C buffer is handed to libxml2.
                value = _utf8(value)
                c_text = _xcstr(value)
            tree.xmlNodeSetContent(self._c_node, c_text)

    # ACCESSORS
    def __getitem__(self, x):
        "__getitem__(self, x)"
        # There are no children: slices are empty, indices are out of range.
        if isinstance(x, slice):
            return []
        else:
            raise IndexError, "list index out of range"

    def __len__(self):
        "__len__(self)"
        return 0

    def get(self, key, default=None):
        "get(self, key, default=None)"
        # There are no attributes, so every lookup misses and yields the
        # caller-supplied default (which used to be ignored).
        return default

    def keys(self):
        "keys(self)"
        return []

    def items(self):
        "items(self)"
        return []

    def values(self):
        "values(self)"
        return []
cdef class _Comment(__ContentOnlyElement):
    """Content-only element whose ``tag`` property returns ``Comment``."""

    @property
    def tag(self):
        return Comment

    def __repr__(self):
        # Rendered in XML comment syntax around the text content.
        return "<!--%s-->" % (self.text,)
cdef class _ProcessingInstruction(__ContentOnlyElement):
    """Content-only element whose ``tag`` property returns
    ``ProcessingInstruction``.
    """

    @property
    def tag(self):
        return ProcessingInstruction

    property target:
        # not in ElementTree
        def __get__(self):
            _assertValidNode(self)
            return funicode(self._c_node.name)

        def __set__(self, value):
            _assertValidNode(self)
            # 'value' keeps the encoded bytes referenced while libxml2
            # reads from the C string.
            value = _utf8(value)
            tree.xmlNodeSetName(self._c_node, _xcstr(value))

    def __repr__(self):
        content = self.text
        if not content:
            return "<?%s?>" % self.target
        return "<?%s %s?>" % (self.target, content)

    def get(self, key, default=None):
        """get(self, key, default=None)

        Try to parse pseudo-attributes from the text content of the
        processing instruction, search for one with the given key as
        name and return its associated value.

        Note that this is only a convenience method for the most
        common case that all text content is structured in
        attribute-like name-value pairs with properly quoted values.
        It is not guaranteed to work for all possible text content.
        """
        pseudo_attributes = self.attrib
        return pseudo_attributes.get(key, default)

    @property
    def attrib(self):
        """Returns a dict containing all pseudo-attributes that can be
        parsed from the text content of this processing instruction.
        Note that modifying the dict currently has no effect on the
        XML node, although this is not guaranteed to stay this way.
        """
        parsed = {}
        # A space is prepended so that a leading pseudo-attribute also
        # satisfies the whitespace prefix required by the pattern.
        for name, single_quoted, double_quoted in _FIND_PI_ATTRIBUTES(' ' + self.text):
            parsed[name] = single_quoted or double_quoted
        return parsed
# Finds whitespace-preceded ``name = 'value'`` / ``name = "value"`` pairs in a
# string.  Each match is a tuple (name, single-quoted value, double-quoted
# value); the group of the quote style that was not used is an empty string.
cdef object _FIND_PI_ATTRIBUTES = re.compile(r'\s+(\w+)\s*=\s*(?:\'([^\']*)\'|"([^"]*)")', re.U).findall
cdef class _Entity(__ContentOnlyElement):
    """Content-only element whose ``tag`` property returns ``Entity``."""

    @property
    def tag(self):
        return Entity

    property name:
        # not in ElementTree
        def __get__(self):
            _assertValidNode(self)
            return funicode(self._c_node.name)

        def __set__(self, value):
            _assertValidNode(self)
            encoded_name = _utf8(value)
            # '&' and ';' delimit an entity reference and cannot be part
            # of its name.
            if b'&' in encoded_name or b';' in encoded_name:
                raise ValueError(f"Invalid entity name '{value}'")
            tree.xmlNodeSetName(self._c_node, _xcstr(encoded_name))

    @property
    def text(self):
        # FIXME: should this be None or '&[VALUE];' or the resolved
        # entity value ?
        _assertValidNode(self)
        entity_name = funicode(self._c_node.name)
        return f'&{entity_name};'

    def __repr__(self):
        return "&%s;" % (self.name,)
cdef class QName:
    """QName(text_or_uri_or_element, tag=None)

    QName wrapper for qualified XML names.

    Pass a tag name by itself or a namespace URI and a tag name to
    create a qualified name.  Alternatively, pass an Element to
    extract its tag name.  ``None`` as first argument is ignored in
    order to allow for generic 2-argument usage.

    The ``text`` property holds the qualified name in
    ``{namespace}tagname`` notation.  The ``namespace`` and
    ``localname`` properties hold the respective parts of the tag
    name.

    You can pass QName objects wherever a tag name is expected.  Also,
    setting Element text from a QName will resolve the namespace prefix
    on assignment and set a qualified text value.  This is helpful in XML
    languages like SOAP or XML-Schema that use prefixed tag names in
    their text content.
    """
    cdef readonly unicode text
    cdef readonly unicode localname
    cdef readonly unicode namespace

    def __init__(self, text_or_uri_or_element, tag=None):
        name = text_or_uri_or_element
        if name is None:
            # Allow None as no namespace.
            name, tag = tag, None
        if not _isString(name):
            # Reduce the supported non-string inputs to a plain tag string.
            if isinstance(name, _Element):
                name = (<_Element>name).tag
                if not _isString(name):
                    raise ValueError(f"Invalid input tag of type {type(name)!r}")
            elif isinstance(name, QName):
                name = (<QName>name).text
            elif name is not None:
                name = unicode(name)
            else:
                raise ValueError(f"Invalid input tag of type {type(name)!r}")

        ns_utf, tag_utf = _getNsTag(name)
        if tag is not None:
            # either ('ns', 'tag') or ('{ns}oldtag', 'newtag')
            if ns_utf is None:
                ns_utf = tag_utf  # case 1: namespace ended up as tag name
            tag_utf = _utf8(tag)
        _tagValidOrRaise(tag_utf)

        self.localname = (<bytes>tag_utf).decode('utf8')
        if ns_utf is not None:
            self.namespace = (<bytes>ns_utf).decode('utf8')
            self.text = "{%s}%s" % (self.namespace, self.localname)
        else:
            self.namespace = None
            self.text = self.localname

    def __str__(self):
        return self.text

    def __hash__(self):
        return hash(self.text)

    def __richcmp__(self, other, int op):
        # Compare by qualified text; the other operand is reduced to text.
        try:
            if type(other) is QName:
                other_text = (<QName>other).text
            elif isinstance(other, unicode):
                other_text = other
            else:
                other_text = unicode(other)
        except (ValueError, UnicodeDecodeError):
            return NotImplemented
        return python.PyObject_RichCompare(self.text, other_text, op)
- cdef public class _ElementTree [ type LxmlElementTreeType,
- object LxmlElementTree ]:
- cdef _Document _doc
- cdef _Element _context_node
- # Note that _doc is only used to store the original document if we do not
- # have a _context_node. All methods should prefer self._context_node._doc
- # to honour tree restructuring. _doc can happily be None!
@cython.final
cdef int _assertHasRoot(self) except -1:
    """Assert that this tree has a root (context) node.

    The document may not have a root node!  This happens if ElementTree()
    is called without any argument and the caller 'forgets' to call
    parse() afterwards, which is a bug in the calling program.
    """
    assert self._context_node is not None, (
        "ElementTree not initialized, missing root")
    return 0
def parse(self, source, _BaseParser parser=None, *, base_url=None):
    """parse(self, source, parser=None, base_url=None)

    Updates self with the content of source and returns its root.
    """
    cdef _Document doc = None
    try:
        doc = _parseDocument(source, parser, base_url)
    except _TargetParserResult as result_container:
        # raises a TypeError if we don't get an _Element
        self._context_node = result_container.result
    else:
        self._context_node = doc.getroot()

    # The document is only stored separately while there is no root node.
    if self._context_node is None:
        self._doc = doc
    else:
        self._doc = None
    return self._context_node
def _setroot(self, _Element root not None):
    """_setroot(self, root)

    Relocate the ElementTree to a new root node.

    Raises TypeError if ``root`` is not an element node.
    """
    _assertValidNode(root)
    is_element_node = root._c_node.type == tree.XML_ELEMENT_NODE
    if not is_element_node:
        raise TypeError("Only elements can be the root of an ElementTree")
    self._context_node = root
    self._doc = None
def getroot(self):
    """getroot(self)

    Gets the root element for this tree.

    Returns the stored context node as is, without checking that one has
    been set.
    """
    return self._context_node
def __copy__(self):
    # Wraps the same document and context node in a new tree object;
    # no nodes are copied here.
    return _elementTreeFactory(self._doc, self._context_node)
def __deepcopy__(self, memo):
    """Return a tree around a copy of the root element or, if there is no
    root, around a copy of the stored document.  A tree that has neither
    is returned unchanged.
    """
    cdef _Element root
    cdef _Document doc
    cdef xmlDoc* c_doc

    if self._context_node is not None:
        root = self._context_node.__copy__()
        assert root is not None
        _assertValidNode(root)
        _copyNonElementSiblings(self._context_node._c_node, root._c_node)
        return _elementTreeFactory(None, root)

    if self._doc is None:
        # so what ...
        return self

    _assertValidDoc(self._doc)
    c_doc = tree.xmlCopyDoc(self._doc._c_doc, 1)
    if c_doc is NULL:
        raise MemoryError()
    doc = _documentFactory(c_doc, self._doc._parser)
    return _elementTreeFactory(doc, None)
# not in ElementTree
@property
def docinfo(self) -> DocInfo:
    """Information about the document provided by parser and DTD."""
    self._assertHasRoot()
    root_document = self._context_node._doc
    return DocInfo(root_document)
# not in ElementTree, read-only
@property
def parser(self):
    """The parser that was used to parse the document in this ElementTree.
    """
    cdef _Element root = self._context_node
    # Prefer the document of the root node over the stored document.
    if root is not None and root._doc is not None:
        return root._doc._parser
    if self._doc is None:
        return None
    return self._doc._parser
def write(self, file, *, encoding=None, method="xml",
          bint pretty_print=False, xml_declaration=None, bint with_tail=True,
          standalone=None, doctype=None, compression=0,
          bint exclusive=False, inclusive_ns_prefixes=None,
          bint with_comments=True, bint strip_text=False,
          docstring=None):
    """write(self, file, encoding=None, method="xml",
              pretty_print=False, xml_declaration=None, with_tail=True,
              standalone=None, doctype=None, compression=0,
              exclusive=False, inclusive_ns_prefixes=None,
              with_comments=True, strip_text=False)

    Write the tree to a filename, file or file-like object.

    Defaults to ASCII encoding and writing a declaration as needed.

    The keyword argument 'method' selects the output method:
    'xml', 'html', 'text', 'c14n' or 'c14n2'.  Default is 'xml'.

    With ``method="c14n"`` (C14N version 1), the options ``exclusive``,
    ``with_comments`` and ``inclusive_ns_prefixes`` request exclusive
    C14N, include comments, and list the inclusive prefixes respectively.

    With ``method="c14n2"`` (C14N version 2), the ``with_comments`` and
    ``strip_text`` options control the output of comments and text space
    according to C14N 2.0.

    Both C14N methods raise ValueError if ``encoding`` is given or
    ``xml_declaration`` is true.  For all other methods, passing
    ``with_comments=False`` raises ValueError.

    Passing a boolean value to the ``standalone`` option will
    output an XML declaration with the corresponding
    ``standalone`` flag.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    The ``docstring`` option is a deprecated alias of ``doctype``.  It is
    only used when ``doctype`` is not given and then triggers a
    DeprecationWarning.

    The ``compression`` option enables GZip compression level 1-9.
    ``None`` and negative values are treated as 0.

    The ``inclusive_ns_prefixes`` should be a list of namespace strings
    (i.e. ['xs', 'xsi']) that will be promoted to the top-level element
    during exclusive C14N serialisation.  This parameter is ignored if
    exclusive mode=False.

    If exclusive=True and no list is provided, a namespace will only be
    rendered if it is used by the immediate parent or one of its attributes
    and its prefix and values have not already been rendered by an ancestor
    of the namespace node's parent element.
    """
    cdef bint write_declaration
    cdef int is_standalone

    self._assertHasRoot()
    _assertValidNode(self._context_node)
    if compression is None or compression < 0:
        compression = 0

    # C14N serialisation
    if method in ('c14n', 'c14n2'):
        if encoding is not None:
            raise ValueError("Cannot specify encoding with C14N")
        if xml_declaration:
            raise ValueError("Cannot enable XML declaration in C14N")

        if method == 'c14n':
            _tofilelikeC14N(file, self._context_node, exclusive, with_comments,
                            compression, inclusive_ns_prefixes)
        else:  # c14n2
            with _open_utf8_file(file, compression=compression) as f:
                target = C14NWriterTarget(
                    f.write, with_comments=with_comments, strip_text=strip_text)
                _tree_to_target(self, target)
        return

    if not with_comments:
        raise ValueError("Can only discard comments in C14N serialisation")
    # suppress decl. in default case (purely for ElementTree compatibility)
    if xml_declaration is not None:
        # An explicit request wins; the encoding is only normalised.
        write_declaration = xml_declaration
        if encoding is None:
            encoding = 'ASCII'
        else:
            encoding = encoding.upper()
    elif encoding is None:
        encoding = 'ASCII'
        write_declaration = 0
    else:
        # No explicit request: declare only for encodings other than
        # the ASCII and UTF-8 spellings listed below.
        encoding = encoding.upper()
        write_declaration = encoding not in (
            'US-ASCII', 'ASCII', 'UTF8', 'UTF-8')
    # Any non-None 'standalone' value forces a declaration; -1 is passed
    # on when the option was not given.
    if standalone is None:
        is_standalone = -1
    elif standalone:
        write_declaration = 1
        is_standalone = 1
    else:
        write_declaration = 1
        is_standalone = 0

    # Deprecated alias: 'docstring' is only honoured without 'doctype'.
    if docstring is not None and doctype is None:
        import warnings
        warnings.warn(
            "The 'docstring' option is deprecated. Use 'doctype' instead.",
            DeprecationWarning)
        doctype = docstring

    _tofilelike(file, self._context_node, encoding, doctype, method,
                write_declaration, 1, pretty_print, with_tail,
                is_standalone, compression)
def getpath(self, _Element element not None):
    """getpath(self, element)

    Returns a structural, absolute XPath expression to find the element.

    For namespaced elements, the expression uses prefixes from the
    document, which therefore need to be provided in order to make any
    use of the expression in XPath.

    Also see the method getelementpath(self, element), which returns a
    self-contained ElementPath expression.

    Raises ValueError if the element does not belong to the document of
    this tree, and MemoryError if libxml2 returns no path.
    """
    cdef _Document doc
    cdef _Element root
    cdef xmlDoc* c_doc
    _assertValidNode(element)
    if self._context_node is not None:
        root = self._context_node
        doc = root._doc
    elif self._doc is not None:
        doc = self._doc
        root = doc.getroot()
    else:
        raise ValueError, "Element is not in this tree."
    _assertValidDoc(doc)
    _assertValidNode(root)
    if element._doc is not doc:
        raise ValueError, "Element is not in this tree."

    # The path is computed while a fake document is set up around 'root',
    # which is torn down again right after the libxml2 call.
    c_doc = _fakeRootDoc(doc._c_doc, root._c_node)
    c_path = tree.xmlGetNodePath(element._c_node)
    _destroyFakeDoc(doc._c_doc, c_doc)
    if c_path is NULL:
        raise MemoryError()
    try:
        return funicode(c_path)
    finally:
        # Release the libxml2 string even if the conversion above fails.
        tree.xmlFree(c_path)
def getelementpath(self, _Element element not None):
    """getelementpath(self, element)

    Returns a structural, absolute ElementPath expression to find the
    element.  This path can be used in the .find() method to look up
    the element, provided that the elements along the path and their
    list of immediate children were not modified in between.

    ElementPath has the advantage over an XPath expression (as returned
    by the .getpath() method) that it does not require additional prefix
    declarations.  It is always self-contained.

    Returns ``'.'`` if the element is the root itself.  Raises ValueError
    if the input is not an element node or is not located below the root
    of this tree.
    """
    cdef _Element root
    cdef Py_ssize_t count
    _assertValidNode(element)
    if element._c_node.type != tree.XML_ELEMENT_NODE:
        raise ValueError, "input is not an Element"
    if self._context_node is not None:
        root = self._context_node
    elif self._doc is not None:
        root = self._doc.getroot()
    else:
        raise ValueError, "Element is not in this tree"
    _assertValidNode(root)
    if element._doc is not root._doc:
        raise ValueError, "Element is not in this tree"

    # Walk from the element up to the root, collecting one path step per
    # level; the steps are reversed at the end.
    path = []
    c_element = element._c_node
    while c_element is not root._c_node:
        c_name = c_element.name
        c_href = _getNs(c_element)
        tag = _namespacedNameFromNsName(c_href, c_name)
        if c_href is NULL:
            c_href = <const_xmlChar*>b''  # no namespace (NULL is wildcard)
        # use tag[N] if there are preceding siblings with the same tag
        count = 0
        c_node = c_element.prev
        while c_node is not NULL:
            if c_node.type == tree.XML_ELEMENT_NODE:
                if _tagMatches(c_node, c_href, c_name):
                    count += 1
            c_node = c_node.prev
        if count:
            # positional predicates are 1-based
            tag = f'{tag}[{count+1}]'
        else:
            # use tag[1] if there are following siblings with the same tag
            c_node = c_element.next
            while c_node is not NULL:
                if c_node.type == tree.XML_ELEMENT_NODE:
                    if _tagMatches(c_node, c_href, c_name):
                        tag += '[1]'
                        break
                c_node = c_node.next

        path.append(tag)
        c_element = c_element.parent
        # Running out of element parents before reaching the root means
        # that the element is not a descendant of the root.
        if c_element is NULL or c_element.type != tree.XML_ELEMENT_NODE:
            raise ValueError, "Element is not in this tree."
    if not path:
        return '.'
    path.reverse()
    return '/'.join(path)
def getiterator(self, tag=None, *tags):
    """getiterator(self, tag=None, *tags)

    Returns a sequence or iterator of all elements in document order
    (depth first pre-order), starting with the root element.

    Can be restricted to find only elements with specific tags,
    see `_Element.iter`.

    :deprecated: Note that this method is deprecated as of
      ElementTree 1.3 and lxml 2.0.  It returns an iterator in
      lxml, which diverges from the original ElementTree
      behaviour.  If you want an efficient iterator, use the
      ``tree.iter()`` method instead.  You should only use this
      method in new code if you require backwards compatibility
      with older versions of lxml or ElementTree.
    """
    root = self.getroot()
    if root is None:
        return ITER_EMPTY
    # A non-None 'tag' is appended after the positional tags.
    selected_tags = tags if tag is None else tags + (tag,)
    return root.getiterator(*selected_tags)
def iter(self, tag=None, *tags):
    """iter(self, tag=None, *tags)

    Creates an iterator for the root element.  The iterator loops over
    all elements in this tree, in document order.  Note that siblings
    of the root element (comments or processing instructions) are not
    returned by the iterator.

    Can be restricted to find only elements with specific tags,
    see `_Element.iter`.
    """
    root = self.getroot()
    if root is None:
        return ITER_EMPTY
    # A non-None 'tag' is appended after the positional tags.
    selected_tags = tags if tag is None else tags + (tag,)
    return root.iter(*selected_tags)
def find(self, path, namespaces=None):
    """find(self, path, namespaces=None)

    Finds the first toplevel element with given tag.  Same as
    ``tree.getroot().find(path)``.

    The optional ``namespaces`` argument accepts a
    prefix-to-namespace mapping that allows the usage of XPath
    prefixes in the path expression.

    A string path starting with "/" is searched relative to the root
    (as "./...") and triggers a FutureWarning.
    """
    self._assertHasRoot()
    root = self.getroot()
    if _isString(path) and path[:1] == "/":
        path = "." + path
        from warnings import warn
        message = (
            "This search incorrectly ignores the root element, and will be "
            "fixed in a future version. If you rely on the current "
            f"behaviour, change it to {path!r}"
        )
        warn(message, FutureWarning, stacklevel=1)
    return root.find(path, namespaces)
def findtext(self, path, default=None, namespaces=None):
    """findtext(self, path, default=None, namespaces=None)

    Finds the text for the first element matching the ElementPath
    expression.  Same as getroot().findtext(path)

    The optional ``namespaces`` argument accepts a
    prefix-to-namespace mapping that allows the usage of XPath
    prefixes in the path expression.

    A string path starting with "/" is searched relative to the root
    (as "./...") and triggers a FutureWarning.
    """
    self._assertHasRoot()
    root = self.getroot()
    if _isString(path) and path[:1] == "/":
        path = "." + path
        from warnings import warn
        message = (
            "This search incorrectly ignores the root element, and will be "
            "fixed in a future version. If you rely on the current "
            f"behaviour, change it to {path!r}"
        )
        warn(message, FutureWarning, stacklevel=1)
    return root.findtext(path, default, namespaces)
def findall(self, path, namespaces=None):
    """findall(self, path, namespaces=None)

    Finds all elements matching the ElementPath expression.  Same as
    getroot().findall(path).

    The optional ``namespaces`` argument accepts a
    prefix-to-namespace mapping that allows the usage of XPath
    prefixes in the path expression.

    A string path starting with "/" is searched relative to the root
    (as "./...") and triggers a FutureWarning.
    """
    self._assertHasRoot()
    root = self.getroot()
    if _isString(path) and path[:1] == "/":
        path = "." + path
        from warnings import warn
        message = (
            "This search incorrectly ignores the root element, and will be "
            "fixed in a future version. If you rely on the current "
            f"behaviour, change it to {path!r}"
        )
        warn(message, FutureWarning, stacklevel=1)
    return root.findall(path, namespaces)
def iterfind(self, path, namespaces=None):
    """iterfind(self, path, namespaces=None)

    Iterates over all elements matching the ElementPath expression.
    Same as getroot().iterfind(path).

    The optional ``namespaces`` argument accepts a
    prefix-to-namespace mapping that allows the usage of XPath
    prefixes in the path expression.

    A string path starting with "/" is searched relative to the root
    (as "./...") and triggers a FutureWarning.
    """
    self._assertHasRoot()
    root = self.getroot()
    if _isString(path) and path[:1] == "/":
        path = "." + path
        from warnings import warn
        message = (
            "This search incorrectly ignores the root element, and will be "
            "fixed in a future version. If you rely on the current "
            f"behaviour, change it to {path!r}"
        )
        warn(message, FutureWarning, stacklevel=1)
    return root.iterfind(path, namespaces)
def xpath(self, _path, *, namespaces=None, extensions=None,
          smart_strings=True, **_variables):
    """xpath(self, _path, namespaces=None, extensions=None, smart_strings=True, **_variables)

    XPath evaluate in context of document.

    ``namespaces`` is an optional dictionary with prefix to namespace URI
    mappings, used by XPath.  ``extensions`` defines additional extension
    functions.

    Returns a list (nodeset), or bool, float or string.

    In case of a list result, return Element for element nodes,
    string for text and attribute values.

    Note: if you are going to apply multiple XPath expressions
    against the same document, it is more efficient to use
    XPathEvaluator directly.
    """
    self._assertHasRoot()
    evaluate = XPathDocumentEvaluator(
        self,
        namespaces=namespaces,
        extensions=extensions,
        smart_strings=smart_strings,
    )
    return evaluate(_path, **_variables)
def xslt(self, _xslt, extensions=None, access_control=None, **_kw):
    """xslt(self, _xslt, extensions=None, access_control=None, **_kw)

    Transform this document using other document.

    xslt is a tree that should be XSLT
    keyword parameters are XSLT transformation parameters.

    Returns the transformed tree.

    Note: if you are going to apply the same XSLT stylesheet against
    multiple documents, it is more efficient to use the XSLT
    class directly.
    """
    self._assertHasRoot()
    transform = XSLT(
        _xslt, extensions=extensions, access_control=access_control)
    return transform(self, **_kw)
def relaxng(self, relaxng):
    """relaxng(self, relaxng)

    Validate this document using other document.

    The relaxng argument is a tree that should contain a Relax NG schema.

    Returns True or False, depending on whether validation
    succeeded.

    Note: if you are going to apply the same Relax NG schema against
    multiple documents, it is more efficient to use the RelaxNG
    class directly.
    """
    self._assertHasRoot()
    validator = RelaxNG(relaxng)
    return validator.validate(self)
def xmlschema(self, xmlschema):
    """xmlschema(self, xmlschema)

    Validate this document using other document.

    The xmlschema argument is a tree that should contain an XML Schema.

    Returns True or False, depending on whether validation
    succeeded.

    Note: If you are going to apply the same XML Schema against
    multiple documents, it is more efficient to use the XMLSchema
    class directly.
    """
    self._assertHasRoot()
    validator = XMLSchema(xmlschema)
    return validator.validate(self)
def xinclude(self):
    """xinclude(self)

    Process the XInclude nodes in this document and include the
    referenced XML fragments.

    There is support for loading files through the file system, HTTP and
    FTP.

    Note that XInclude does not support custom resolvers in Python space
    due to restrictions of libxml2 <= 2.6.29.
    """
    self._assertHasRoot()
    process_includes = XInclude()
    process_includes(self._context_node)
def write_c14n(self, file, *, bint exclusive=False, bint with_comments=True,
               compression=0, inclusive_ns_prefixes=None):
    """write_c14n(self, file, exclusive=False, with_comments=True,
                   compression=0, inclusive_ns_prefixes=None)

    C14N write of document.  Always writes UTF-8.

    The ``compression`` option enables GZip compression level 1-9.

    The ``inclusive_ns_prefixes`` should be a list of namespace strings
    (i.e. ['xs', 'xsi']) that will be promoted to the top-level element
    during exclusive C14N serialisation.  This parameter is ignored if
    exclusive mode=False.

    If exclusive=True and no list is provided, a namespace will only be
    rendered if it is used by the immediate parent or one of its attributes
    and its prefix and values have not already been rendered by an ancestor
    of the namespace node's parent element.

    NOTE: This method is deprecated as of lxml 4.4 and will be removed in a
    future release.  Use ``.write(f, method="c14n")`` instead.
    """
    self._assertHasRoot()
    _assertValidNode(self._context_node)
    # None and negative levels mean "no compression".
    if compression is None or compression < 0:
        compression = 0
    _tofilelikeC14N(
        file, self._context_node, exclusive, with_comments,
        compression, inclusive_ns_prefixes)
cdef _ElementTree _elementTreeFactory(_Document doc, _Element context_node):
    # Convenience wrapper around _newElementTree() that always uses the
    # plain _ElementTree class.
    return _newElementTree(doc, context_node, _ElementTree)
cdef _ElementTree _newElementTree(_Document doc, _Element context_node,
                                  object baseclass):
    """Instantiate ``baseclass`` and bind it to a context node or, if no
    root node is available, to the document itself.
    """
    cdef _ElementTree new_tree = baseclass()
    # Without an explicit context node, fall back to the document root.
    if context_node is None and doc is not None:
        context_node = doc.getroot()
    if context_node is not None:
        _assertValidNode(context_node)
    else:
        # The document is only stored while there is no root node.
        _assertValidDoc(doc)
        new_tree._doc = doc
    new_tree._context_node = context_node
    return new_tree
@cython.final
@cython.freelist(16)
cdef class _Attrib:
    """A dict-like proxy for the ``Element.attrib`` property.
    """
    cdef _Element _element

    def __cinit__(self, _Element element not None):
        _assertValidNode(element)
        self._element = element

    # MANIPULATORS

    def __setitem__(self, key, value):
        _assertValidNode(self._element)
        _setAttributeValue(self._element, key, value)

    def __delitem__(self, key):
        _assertValidNode(self._element)
        _delAttribute(self._element, key)

    def update(self, sequence_or_dict):
        _assertValidNode(self._element)
        # dicts and other _Attrib proxies contribute their items; anything
        # else is iterated as (key, value) pairs directly.
        if isinstance(sequence_or_dict, (dict, _Attrib)):
            pairs = sequence_or_dict.items()
        else:
            pairs = sequence_or_dict
        for name, value in pairs:
            _setAttributeValue(self._element, name, value)

    def pop(self, key, *default):
        if len(default) > 1:
            raise TypeError(f"pop expected at most 2 arguments, got {len(default)+1}")
        _assertValidNode(self._element)
        value = _getAttributeValue(self._element, key, None)
        if value is not None:
            _delAttribute(self._element, key)
            return value
        if default:
            return default[0]
        raise KeyError, key

    def clear(self):
        _assertValidNode(self._element)
        c_attrs = self._element._c_node.properties
        if c_attrs is not NULL:
            # Detach the list from the node before freeing it.
            self._element._c_node.properties = NULL
            tree.xmlFreePropList(c_attrs)

    # ACCESSORS

    def __repr__(self):
        _assertValidNode(self._element)
        snapshot = dict(_collectAttributes(self._element._c_node, 3))
        return repr(snapshot)

    def __copy__(self):
        _assertValidNode(self._element)
        snapshot = dict(_collectAttributes(self._element._c_node, 3))
        return snapshot

    def __deepcopy__(self, memo):
        _assertValidNode(self._element)
        snapshot = dict(_collectAttributes(self._element._c_node, 3))
        return snapshot

    def __getitem__(self, key):
        _assertValidNode(self._element)
        value = _getAttributeValue(self._element, key, None)
        if value is not None:
            return value
        raise KeyError, key

    def __bool__(self):
        cdef xmlAttr* c_attr
        _assertValidNode(self._element)
        c_attr = self._element._c_node.properties
        # True as soon as one real attribute node is found.
        while c_attr is not NULL:
            if c_attr.type == tree.XML_ATTRIBUTE_NODE:
                return 1
            c_attr = c_attr.next
        return 0

    def __len__(self):
        cdef xmlAttr* c_attr
        cdef Py_ssize_t attr_count = 0
        _assertValidNode(self._element)
        c_attr = self._element._c_node.properties
        # Only attribute nodes in the property list are counted.
        while c_attr is not NULL:
            if c_attr.type == tree.XML_ATTRIBUTE_NODE:
                attr_count += 1
            c_attr = c_attr.next
        return attr_count

    def get(self, key, default=None):
        _assertValidNode(self._element)
        return _getAttributeValue(self._element, key, default)

    # The last argument of _collectAttributes() selects the result kind:
    # 1 for keys, 2 for values, 3 for (key, value) items.

    def keys(self):
        _assertValidNode(self._element)
        return _collectAttributes(self._element._c_node, 1)

    def __iter__(self):
        _assertValidNode(self._element)
        names = _collectAttributes(self._element._c_node, 1)
        return iter(names)

    def iterkeys(self):
        _assertValidNode(self._element)
        names = _collectAttributes(self._element._c_node, 1)
        return iter(names)

    def values(self):
        _assertValidNode(self._element)
        return _collectAttributes(self._element._c_node, 2)

    def itervalues(self):
        _assertValidNode(self._element)
        attr_values = _collectAttributes(self._element._c_node, 2)
        return iter(attr_values)

    def items(self):
        _assertValidNode(self._element)
        return _collectAttributes(self._element._c_node, 3)

    def iteritems(self):
        _assertValidNode(self._element)
        pairs = _collectAttributes(self._element._c_node, 3)
        return iter(pairs)

    def has_key(self, key):
        _assertValidNode(self._element)
        return key in self

    def __contains__(self, key):
        cdef xmlNode* c_node
        cdef const_xmlChar* c_href
        _assertValidNode(self._element)
        ns, tag = _getNsTag(key)
        c_node = self._element._c_node
        if ns is None:
            c_href = NULL
        else:
            c_href = _xcstr(ns)
        if tree.xmlHasNsProp(c_node, _xcstr(tag), c_href):
            return 1
        return 0

    def __richcmp__(self, other, int op):
        # Both sides are compared as plain dicts.
        try:
            own_items = dict(self.items())
            if not isinstance(other, dict):
                other = dict(other)
        except (TypeError, ValueError):
            return NotImplemented
        return python.PyObject_RichCompare(own_items, other, op)
# Register _Attrib with the MutableMapping ABC (presumably the one from
# collections.abc -- confirm against the file's imports) so that
# isinstance()/issubclass() checks against it succeed.
MutableMapping.register(_Attrib)
@cython.final
@cython.internal
cdef class _AttribIterator:
    """Attribute iterator - for internal use only!
    """
    # XML attributes must not be removed while running!
    cdef _Element _node
    cdef xmlAttr* _c_attr
    cdef int _keysvalues  # 1 - keys, 2 - values, 3 - items (key, value)

    def __iter__(self):
        return self

    def __next__(self):
        cdef xmlAttr* c_current
        cdef int mode
        # _node is reset to None once the iterator is exhausted.
        if self._node is None:
            raise StopIteration
        c_current = self._c_attr
        # Skip entries of the property list that are not attribute nodes.
        while c_current is not NULL and c_current.type != tree.XML_ATTRIBUTE_NODE:
            c_current = c_current.next
        if c_current is NULL:
            self._node = None
            raise StopIteration

        # Remember where to continue on the next call.
        self._c_attr = c_current.next
        mode = self._keysvalues
        if mode == 1:
            return _namespacedName(<xmlNode*>c_current)
        if mode == 2:
            return _attributeValue(self._node._c_node, c_current)
        return (_namespacedName(<xmlNode*>c_current),
                _attributeValue(self._node._c_node, c_current))
cdef object _attributeIteratorFactory(_Element element, int keysvalues):
    """Create an attribute iterator for 'element'.

    'keysvalues' selects the output: 1 - keys, 2 - values, 3 - items.
    Returns the shared ITER_EMPTY object if the element has no
    attribute list at all.
    """
    cdef _AttribIterator iterator
    cdef xmlAttr* c_first_attr = element._c_node.properties
    if c_first_attr is NULL:
        return ITER_EMPTY
    iterator = _AttribIterator()
    iterator._keysvalues = keysvalues
    iterator._c_attr = c_first_attr
    iterator._node = element
    return iterator
cdef public class _ElementTagMatcher [ object LxmlElementTagMatcher,
                                       type LxmlElementTagMatcherType ]:
    """
    Dead but public. :)
    """
    cdef object _pystrings
    cdef int _node_type
    cdef char* _href
    cdef char* _name

    cdef _initTagMatch(self, tag):
        # Configure the matcher for a single tag selector: None (anything),
        # one of the node factories, or a tag name.
        self._href = NULL
        self._name = NULL
        if tag is None:
            self._node_type = 0
            return
        if tag is Comment:
            self._node_type = tree.XML_COMMENT_NODE
            return
        if tag is ProcessingInstruction:
            self._node_type = tree.XML_PI_NODE
            return
        if tag is Entity:
            self._node_type = tree.XML_ENTITY_REF_NODE
            return

        # the Element factory and plain tag names both select element nodes
        self._node_type = tree.XML_ELEMENT_NODE
        if tag is Element:
            return

        # presumably stored on self so that the objects behind the
        # char* pointers below stay referenced
        self._pystrings = _getNsTag(tag)
        if self._pystrings[0] is not None:
            self._href = _cstr(self._pystrings[0])
        self._name = _cstr(self._pystrings[1])
        if self._name[0] == c'*' and self._name[1] == c'\0':
            # a plain '*' name is stored as "no name restriction"
            self._name = NULL
cdef public class _ElementIterator(_ElementTagMatcher) [
    object LxmlElementIterator, type LxmlElementIteratorType ]:
    """
    Dead but public. :)
    """
    # we keep Python references here to control GC
    cdef _Element _node
    cdef _node_to_node_function _next_element

    def __iter__(self):
        return self

    cdef void _storeNext(self, _Element node):
        # Advance from 'node' to the next matching node and store it
        # (or None when the traversal is finished).
        cdef xmlNode* c_candidate = self._next_element(node._c_node)
        while c_candidate is not NULL:
            if self._node_type == 0:
                # no node type restriction => take whatever comes next
                break
            if (<tree.xmlElementType>self._node_type == c_candidate.type and
                    _tagMatches(c_candidate, <const_xmlChar*>self._href,
                                <const_xmlChar*>self._name)):
                break
            c_candidate = self._next_element(c_candidate)

        if c_candidate is not NULL:
            # Python ref:
            self._node = _elementFactory(node._doc, c_candidate)
        else:
            self._node = None

    def __next__(self):
        # Python ref:
        cdef _Element current_node = self._node
        if current_node is None:
            raise StopIteration
        self._storeNext(current_node)
        return current_node
@cython.final
@cython.internal
cdef class _MultiTagMatcher:
    """
    Match an xmlNode against a list of tags.

    Tag selectors are kept as (href, name) pairs in ``_py_tags`` and are
    mapped into a C array (``_cached_tags``) per document by
    ``cacheTags()``.  Node type selectors (Comment, PI, Entity, Element)
    are kept as a bit mask in ``_node_types``.
    """
    cdef list _py_tags
    cdef qname* _cached_tags
    cdef size_t _tag_count
    cdef size_t _cached_size
    cdef _Document _cached_doc
    cdef int _node_types

    def __cinit__(self, tags):
        self._py_tags = []
        self.initTagMatch(tags)

    def __dealloc__(self):
        self._clear()

    cdef bint rejectsAll(self) noexcept:
        # neither a cached tag nor a node type selected
        return self._tag_count == 0 and self._node_types == 0

    cdef bint rejectsAllAttributes(self) noexcept:
        return self._tag_count == 0

    cdef bint matchesType(self, int node_type) noexcept:
        if self._tag_count and node_type == tree.XML_ELEMENT_NODE:
            return True
        return (self._node_types & (1 << node_type)) != 0

    cdef void _clear(self) noexcept:
        # Release the cached C array and the references it holds.
        cdef size_t i
        cdef size_t used = self._tag_count
        self._tag_count = 0
        if not self._cached_tags:
            return
        for i in range(used):
            cpython.ref.Py_XDECREF(self._cached_tags[i].href)
        python.lxml_free(self._cached_tags)
        self._cached_tags = NULL

    cdef initTagMatch(self, tags):
        # (Re-)configure the matcher for a new tag selection.
        self._cached_doc = None
        del self._py_tags[:]
        self._clear()
        if tags is None or tags == ():
            # no selection in tags argument => match anything
            self._node_types = (
                (1 << tree.XML_COMMENT_NODE) |
                (1 << tree.XML_PI_NODE) |
                (1 << tree.XML_ENTITY_REF_NODE) |
                (1 << tree.XML_ELEMENT_NODE))
            return
        self._node_types = 0
        self._storeTags(tags, set())

    cdef _storeTags(self, tag, set seen):
        # Recursively register one selector or a sequence of selectors.
        if tag is Comment:
            self._node_types |= 1 << tree.XML_COMMENT_NODE
            return
        if tag is ProcessingInstruction:
            self._node_types |= 1 << tree.XML_PI_NODE
            return
        if tag is Entity:
            self._node_types |= 1 << tree.XML_ENTITY_REF_NODE
            return
        if tag is Element:
            self._node_types |= 1 << tree.XML_ELEMENT_NODE
            return
        if isinstance(tag, QName) and not python._isString(tag):
            self._storeTags(tag.text, seen)
            return
        if not python._isString(tag):
            # support a sequence of tags
            for item in tag:
                self._storeTags(item, seen)
            return

        # plain tag name: process each distinct name only once
        if tag in seen:
            return
        seen.add(tag)
        if tag in ('*', '{*}*'):
            self._node_types |= 1 << tree.XML_ELEMENT_NODE
            return
        href, name = _getNsTag(tag)
        if name == b'*':
            name = None
        if href is None:
            href = b''  # no namespace
        elif href == b'*':
            href = None  # wildcard: any namespace, including none
        self._py_tags.append((href, name))

    cdef inline int cacheTags(self, _Document doc, bint force_into_dict=False) except -1:
        """
        Look up the tag names in the doc dict to enable string pointer comparisons.
        """
        cdef size_t dict_size = tree.xmlDictSize(doc._c_doc.dict)
        if dict_size == self._cached_size and doc is self._cached_doc:
            # doc and dict didn't change => names already cached
            return 0
        self._tag_count = 0
        if self._py_tags:
            if not self._cached_tags:
                self._cached_tags = <qname*>python.lxml_malloc(
                    len(self._py_tags), sizeof(qname))
                if not self._cached_tags:
                    self._cached_doc = None
                    raise MemoryError()
            self._tag_count = <size_t>_mapTagsToQnameMatchArray(
                doc._c_doc, self._py_tags, self._cached_tags, force_into_dict)
        self._cached_doc = doc
        self._cached_size = dict_size
        return 0

    cdef inline bint matches(self, xmlNode* c_node) noexcept:
        cdef qname* c_qname
        if self._node_types & (1 << c_node.type):
            return True
        if c_node.type != tree.XML_ELEMENT_NODE:
            return False
        for c_qname in self._cached_tags[:self._tag_count]:
            if _tagMatchesExactly(c_node, c_qname):
                return True
        return False

    cdef inline bint matchesNsTag(self, const_xmlChar* c_href,
                                  const_xmlChar* c_name) noexcept:
        cdef qname* c_qname
        if self._node_types & (1 << tree.XML_ELEMENT_NODE):
            # all elements are selected, regardless of their name
            return True
        for c_qname in self._cached_tags[:self._tag_count]:
            if _nsTagMatchesExactly(c_href, c_name, c_qname):
                return True
        return False

    cdef inline bint matchesAttribute(self, xmlAttr* c_attr) noexcept:
        """Attribute matches differ from Element matches in that they do
        not care about node types.
        """
        cdef qname* c_qname
        for c_qname in self._cached_tags[:self._tag_count]:
            if _tagMatchesExactly(<xmlNode*>c_attr, c_qname):
                return True
        return False
cdef class _ElementMatchIterator:
    # Base class of the tag matching iterators below.  Subclasses set
    # '_next_element' to the traversal step function and prime '_node'.
    cdef _Element _node
    cdef _node_to_node_function _next_element
    cdef _MultiTagMatcher _matcher

    @cython.final
    cdef _initTagMatcher(self, tags):
        self._matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tags)

    def __iter__(self):
        return self

    @cython.final
    cdef int _storeNext(self, _Element node) except -1:
        # Step from 'node' to the next node accepted by the matcher.
        cdef xmlNode* c_node
        self._matcher.cacheTags(node._doc)
        c_node = self._next_element(node._c_node)
        while c_node is not NULL:
            if self._matcher.matches(c_node):
                break
            c_node = self._next_element(c_node)
        # store Python ref to next node to make sure it's kept alive
        if c_node is NULL:
            self._node = None
        else:
            self._node = _elementFactory(node._doc, c_node)
        return 0

    def __next__(self):
        cdef _Element result = self._node
        if result is None:
            raise StopIteration
        self._storeNext(result)
        return result
cdef class ElementChildIterator(_ElementMatchIterator):
    """ElementChildIterator(self, node, tag=None, reversed=False)

    Iterates over the children of an element.

    Pass ``reversed=True`` to start at the last child and walk backwards.
    """
    def __cinit__(self, _Element node not None, tag=None, *, bint reversed=False):
        cdef xmlNode* c_child
        _assertValidNode(node)
        self._initTagMatcher(tag)
        if not reversed:
            c_child = _findChildForwards(node._c_node, 0)
            self._next_element = _nextElement
        else:
            c_child = _findChildBackwards(node._c_node, 0)
            self._next_element = _previousElement
        self._matcher.cacheTags(node._doc)
        # skip ahead to the first child that the matcher accepts
        while c_child is not NULL:
            if self._matcher.matches(c_child):
                break
            c_child = self._next_element(c_child)
        # store Python ref to next node to make sure it's kept alive
        if c_child is NULL:
            self._node = None
        else:
            self._node = _elementFactory(node._doc, c_child)
cdef class SiblingsIterator(_ElementMatchIterator):
    """SiblingsIterator(self, node, tag=None, preceding=False)

    Iterates over the siblings of an element.

    You can pass the boolean keyword ``preceding`` to specify the direction.
    """
    def __cinit__(self, _Element node not None, tag=None, *, bint preceding=False):
        _assertValidNode(node)
        self._initTagMatcher(tag)
        # pick the step function for the requested direction
        if not preceding:
            self._next_element = _nextElement
        else:
            self._next_element = _previousElement
        # the start node itself is not part of the result
        self._storeNext(node)
cdef class AncestorsIterator(_ElementMatchIterator):
    """AncestorsIterator(self, node, tag=None)

    Iterates over the ancestors of an element (from parent to parent).
    """
    def __cinit__(self, _Element node not None, tag=None):
        _assertValidNode(node)
        # each step moves one level up in the tree
        self._next_element = _parentElement
        self._initTagMatcher(tag)
        # start with the first matching ancestor, not with 'node' itself
        self._storeNext(node)
cdef class ElementDepthFirstIterator:
    """ElementDepthFirstIterator(self, node, tag=None, inclusive=True)

    Iterates over an element and its sub-elements in document order (depth
    first pre-order).

    Note that this also includes comments, entities and processing
    instructions.  To filter them out, check if the ``tag`` property
    of the returned element is a string (i.e. not None and not a
    factory function), or pass the ``Element`` factory for the ``tag``
    argument to receive only Elements.

    If the optional ``tag`` argument is not None, the iterator returns only
    the elements that match the respective name and namespace.

    The optional boolean argument 'inclusive' defaults to True and can be set
    to False to exclude the start element itself.

    Note that the behaviour of this iterator is completely undefined if the
    tree it traverses is modified during iteration.
    """
    # we keep Python references here to control GC
    # keep the next Element after the one we return, and the (s)top node
    cdef _Element _next_node
    cdef _Element _top_node
    cdef _MultiTagMatcher _matcher

    def __cinit__(self, _Element node not None, tag=None, *, bint inclusive=True):
        _assertValidNode(node)
        self._top_node = node
        self._next_node = node
        self._matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag)
        self._matcher.cacheTags(node._doc)
        if inclusive and self._matcher.matches(node._c_node):
            # the start node itself is the first result
            return
        # find start node (this cannot raise StopIteration, self._next_node != None)
        next(self)

    def __iter__(self):
        return self

    def __next__(self):
        cdef xmlNode* c_node
        cdef _Element current_node = self._next_node
        if current_node is None:
            raise StopIteration
        c_node = current_node._c_node
        self._matcher.cacheTags(current_node._doc)
        if self._matcher._tag_count:
            c_node = self._nextNodeMatchTag(c_node)
        else:
            # no tag name was found in the dict => not in document either
            # try to match by node type
            c_node = self._nextNodeAnyTag(c_node)
        # look ahead by one node so that a Python reference keeps it alive
        if c_node is not NULL:
            self._next_node = _elementFactory(current_node._doc, c_node)
        else:
            self._next_node = None
        return current_node

    @cython.final
    cdef xmlNode* _nextNodeAnyTag(self, xmlNode* c_node) noexcept:
        # Find the next node below the top node that has a selected node type.
        cdef int node_types = self._matcher._node_types
        if not node_types:
            return NULL
        tree.BEGIN_FOR_EACH_ELEMENT_FROM(self._top_node._c_node, c_node, 0)
        if node_types & (1 << c_node.type):
            return c_node
        tree.END_FOR_EACH_ELEMENT_FROM(c_node)
        return NULL

    @cython.final
    cdef xmlNode* _nextNodeMatchTag(self, xmlNode* c_node) noexcept:
        # Find the next node below the top node that the matcher accepts.
        tree.BEGIN_FOR_EACH_ELEMENT_FROM(self._top_node._c_node, c_node, 0)
        if self._matcher.matches(c_node):
            return c_node
        tree.END_FOR_EACH_ELEMENT_FROM(c_node)
        return NULL
cdef class ElementTextIterator:
    """ElementTextIterator(self, element, tag=None, with_tail=True)

    Iterates over the text content of a subtree.

    You can pass the ``tag`` keyword argument to restrict text content to a
    specific tag name.

    You can set the ``with_tail`` keyword argument to ``False`` to skip over
    tail text (e.g. if you know that it's only whitespace from pretty-printing).
    """
    cdef object _events
    cdef _Element _start_element

    def __cinit__(self, _Element element not None, tag=None, *, bint with_tail=True):
        _assertValidNode(element)
        # tail text is picked up on the non-"start" events, so those are
        # only requested when tails are wanted
        events = ("start", "comment", "pi", "end") if with_tail else ("start",)
        self._start_element = element
        self._events = iterwalk(element, events=events, tag=tag)

    def __iter__(self):
        return self

    def __next__(self):
        cdef _Element element
        while True:
            event, element = next(self._events)  # raises StopIteration
            if event == "start":
                text = element.text
            elif element is not self._start_element:
                text = element.tail
            else:
                # the tail of the start element is not part of the subtree
                continue
            if text is not None:
                return text
cdef xmlNode* _createElement(xmlDoc* c_doc, object name_utf) except NULL:
    """Create a new element node named 'name_utf' for the document 'c_doc'.

    'name_utf' is passed through _xcstr() to get the C name.  The node is
    created without a namespace and without content.

    Raises MemoryError if libxml2 fails to create the node.
    """
    cdef xmlNode* c_node = tree.xmlNewDocNode(c_doc, NULL, _xcstr(name_utf), NULL)
    if c_node is NULL:
        # This function is declared 'except NULL', so a NULL return value
        # signals an error to the caller.  Make sure that an exception is
        # actually set in that case.
        raise MemoryError()
    return c_node
cdef xmlNode* _createComment(xmlDoc* c_doc, const_xmlChar* text) noexcept:
    # Thin wrapper: create a comment node with the given text for 'c_doc'.
    # The result of xmlNewDocComment() is returned unchecked.
    return tree.xmlNewDocComment(c_doc, text)
cdef xmlNode* _createPI(xmlDoc* c_doc, const_xmlChar* target, const_xmlChar* text) noexcept:
    # Thin wrapper: create a processing instruction node for 'c_doc'.
    # The result of xmlNewDocPI() is returned unchecked.
    return tree.xmlNewDocPI(c_doc, target, text)
cdef xmlNode* _createEntity(xmlDoc* c_doc, const_xmlChar* name) noexcept:
    # Thin wrapper: create an entity reference node for 'c_doc'.
    # The result of xmlNewReference() is returned unchecked.
    return tree.xmlNewReference(c_doc, name)
- # module-level API for ElementTree
- from abc import ABC
class Element(ABC):
    """Element(_tag, attrib=None, nsmap=None, **_extra)

    Element factory, as a class.

    An instance of this class is an object implementing the
    Element interface.

        >>> element = Element("test")
        >>> type(element)
        <class 'lxml.etree._Element'>
        >>> isinstance(element, Element)
        True
        >>> issubclass(_Element, Element)
        True

    Also look at the `_Element.makeelement()` and
    `_BaseParser.makeelement()` methods, which provide a faster way to
    create an Element within a specific document or parser context.
    """
    def __new__(cls, _tag, attrib=None, nsmap=None, **_extra):
        # 'cls' is not used: the object to return is built by
        # _makeElement() from the tag, attributes and namespace mapping.
        return _makeElement(_tag, NULL, None, None, None, None,
                            attrib, nsmap, _extra)


# Register _Element as a virtual subclass of Element
Element.register(_Element)
def Comment(text=None):
    """Comment(text=None)

    Comment element factory.  This factory function creates a special element
    that will be serialized as an XML comment.

    ``text`` defaults to an empty comment.  Raises ValueError if the text
    contains ``--`` or ends with ``-``, and MemoryError if the comment node
    cannot be created.
    """
    cdef _Document doc
    cdef xmlNode* c_node
    cdef xmlDoc* c_doc

    if text is None:
        text = b''
    else:
        text = _utf8(text)
        if b'--' in text or text.endswith(b'-'):
            raise ValueError("Comment may not contain '--' or end with '-'")

    # the comment lives in a new document of its own
    c_doc = _newXMLDoc()
    doc = _documentFactory(c_doc, None)
    c_node = _createComment(c_doc, _xcstr(text))
    if c_node is NULL:
        # node creation failed => do not pass a NULL node on
        raise MemoryError()
    tree.xmlAddChild(<xmlNode*>c_doc, c_node)
    return _elementFactory(doc, c_node)
def ProcessingInstruction(target, text=None):
    """ProcessingInstruction(target, text=None)

    ProcessingInstruction element factory.  This factory function creates a
    special element that will be serialized as an XML processing instruction.

    Raises ValueError if the target is (case-insensitively) ``xml`` or if
    the text contains ``?>``, and MemoryError if the node cannot be created.
    Whatever ``_tagValidOrRaise()`` raises for an invalid target name is
    passed through.
    """
    cdef _Document doc
    cdef xmlNode* c_node
    cdef xmlDoc* c_doc

    target = _utf8(target)
    _tagValidOrRaise(target)
    if target.lower() == b'xml':
        # 'target' is a byte string at this point; decode it so that the
        # message shows the plain name instead of a b'...' repr.
        raise ValueError(f"Invalid PI name '{target.decode('utf8')}'")

    if text is None:
        text = b''
    else:
        text = _utf8(text)
        if b'?>' in text:
            raise ValueError("PI text must not contain '?>'")

    # the processing instruction lives in a new document of its own
    c_doc = _newXMLDoc()
    doc = _documentFactory(c_doc, None)
    c_node = _createPI(c_doc, _xcstr(target), _xcstr(text))
    if c_node is NULL:
        # node creation failed => do not pass a NULL node on
        raise MemoryError()
    tree.xmlAddChild(<xmlNode*>c_doc, c_node)
    return _elementFactory(doc, c_node)


PI = ProcessingInstruction
cdef class CDATA:
    """CDATA(data)

    CDATA factory.  This factory creates an opaque data object that
    can be used to set Element text.  The usual way to use it is::

        >>> el = Element('content')
        >>> el.text = CDATA('a string')

        >>> print(el.text)
        a string
        >>> print(tostring(el, encoding="unicode"))
        <content><![CDATA[a string]]></content>
    """
    # the data as converted by _utf8()
    cdef bytes _utf8_data

    def __cinit__(self, data):
        # convert once at creation time; an error raised by _utf8()
        # prevents the object from being created
        self._utf8_data = _utf8(data)
def Entity(name):
    """Entity(name)

    Entity factory.  This factory function creates a special element
    that will be serialized as an XML entity reference or character
    reference.  Note, however, that entities will not be automatically
    declared in the document.  A document that uses entity references
    requires a DTD to define the entities.

    A name starting with ``#`` is validated as a character reference,
    anything else as an XML name.  Raises ValueError for invalid names
    and MemoryError if the node cannot be created.
    """
    cdef _Document doc
    cdef xmlNode* c_node
    cdef xmlDoc* c_doc
    cdef const_xmlChar* c_name

    name_utf = _utf8(name)
    c_name = _xcstr(name_utf)
    if c_name[0] == c'#':
        # character reference: validate the part after the '#'
        if not _characterReferenceIsValid(c_name + 1):
            raise ValueError(f"Invalid character reference: '{name}'")
    elif not _xmlNameIsValid(c_name):
        raise ValueError(f"Invalid entity reference: '{name}'")

    # the entity reference lives in a new document of its own
    c_doc = _newXMLDoc()
    doc = _documentFactory(c_doc, None)
    c_node = _createEntity(c_doc, c_name)
    if c_node is NULL:
        # node creation failed => do not pass a NULL node on
        raise MemoryError()
    tree.xmlAddChild(<xmlNode*>c_doc, c_node)
    return _elementFactory(doc, c_node)
def SubElement(_Element _parent not None, _tag,
               attrib=None, nsmap=None, **_extra):
    """SubElement(_parent, _tag, attrib=None, nsmap=None, **_extra)

    Subelement factory.  This function creates an element instance, and
    appends it to an existing element.

    ``_parent`` must not be None.  Additional keyword arguments are
    passed on together with ``attrib`` and ``nsmap``.
    """
    # all of the work is done by _makeSubElement()
    return _makeSubElement(_parent, _tag, None, None, attrib, nsmap, _extra)
- from typing import Generic, TypeVar
- T = TypeVar("T")
class ElementTree(ABC, Generic[T]):
    """ElementTree(element=None, file=None, parser=None)

    ElementTree wrapper class, implemented as a factory.

    The document is taken from ``element`` if one is given, otherwise it
    is parsed from ``file`` if that is given, and otherwise a new empty
    document is created.  When both ``element`` and ``file`` are passed,
    ``element`` takes precedence and ``file`` is not read.

    If parsing ``file`` ends with a parser target result, that result is
    returned instead of a tree.
    """
    def __new__(cls, _Element element=None, *, file=None, _BaseParser parser=None):
        """ElementTree(element=None, file=None, parser=None)

        ElementTree wrapper class.
        """
        cdef xmlDoc* c_doc
        cdef _Document doc

        if element is not None:
            # wrap the document that the element already lives in
            doc = element._doc
        elif file is not None:
            try:
                doc = _parseDocument(file, parser, None)
            except _TargetParserResult as result_container:
                return result_container.result
        else:
            # neither element nor file => start with an empty document
            c_doc = _newXMLDoc()
            doc = _documentFactory(c_doc, parser)

        return _elementTreeFactory(doc, element)


# Register _ElementTree as a virtual subclass of ElementTree
ElementTree.register(_ElementTree)
- # Remove "ABC" and typing helpers from module dict
- del ABC, Generic, TypeVar, T
def HTML(text, _BaseParser parser=None, *, base_url=None):
    """HTML(text, parser=None, base_url=None)

    Parses an HTML document from a string constant.  Returns the root
    node (or the result returned by a parser target).  This function
    can be used to embed "HTML literals" in Python code.

    To override the parser with a different ``HTMLParser`` you can pass it to
    the ``parser`` keyword argument.

    The ``base_url`` keyword argument allows to set the original base URL of
    the document to support relative Paths when looking up external entities
    (DTD, XInclude, ...).
    """
    cdef _Document doc
    if parser is None:
        # use the configured default parser only if it is an HTML parser
        default_parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
        if isinstance(default_parser, HTMLParser):
            parser = default_parser
        else:
            parser = __DEFAULT_HTML_PARSER
    try:
        doc = _parseMemoryDocument(text, base_url, parser)
        return doc.getroot()
    except _TargetParserResult as target_result:
        return target_result.result
def XML(text, _BaseParser parser=None, *, base_url=None):
    """XML(text, parser=None, base_url=None)

    Parses an XML document or fragment from a string constant.
    Returns the root node (or the result returned by a parser target).
    This function can be used to embed "XML literals" in Python code,
    like in

       >>> root = XML("<root><test/></root>")
       >>> print(root.tag)
       root

    To override the parser with a different ``XMLParser`` you can pass it to
    the ``parser`` keyword argument.

    The ``base_url`` keyword argument allows to set the original base URL of
    the document to support relative Paths when looking up external entities
    (DTD, XInclude, ...).
    """
    cdef _Document doc
    if parser is None:
        # use the configured default parser only if it is an XML parser
        default_parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
        if isinstance(default_parser, XMLParser):
            parser = default_parser
        else:
            parser = __DEFAULT_XML_PARSER
    try:
        doc = _parseMemoryDocument(text, base_url, parser)
        return doc.getroot()
    except _TargetParserResult as target_result:
        return target_result.result
def fromstring(text, _BaseParser parser=None, *, base_url=None):
    """fromstring(text, parser=None, base_url=None)

    Parses an XML document or fragment from a string.  Returns the
    root node (or the result returned by a parser target).

    To override the default parser with a different parser you can pass it to
    the ``parser`` keyword argument.

    The ``base_url`` keyword argument allows to set the original base URL of
    the document to support relative Paths when looking up external entities
    (DTD, XInclude, ...).
    """
    cdef _Document doc
    try:
        # 'parser' is passed on as is, including None
        doc = _parseMemoryDocument(text, base_url, parser)
        return doc.getroot()
    except _TargetParserResult as result_container:
        # a parser target produced the result => return that instead
        return result_container.result
def fromstringlist(strings, _BaseParser parser=None):
    """fromstringlist(strings, parser=None)

    Parses an XML document from a sequence of strings.  Returns the
    root node (or the result returned by a parser target).

    To override the default parser with a different parser you can pass it to
    the ``parser`` keyword argument.

    Raises ValueError when a single (byte or unicode) string is passed
    instead of a sequence of strings.
    """
    if isinstance(strings, (bytes, unicode)):
        raise ValueError("passing a single string into fromstringlist() is not"
                         " efficient, use fromstring() instead")

    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()

    # feed the chunks one by one, then let the parser finish up
    feed_chunk = parser.feed
    for chunk in strings:
        feed_chunk(chunk)
    return parser.close()
def iselement(element):
    """iselement(element)

    Checks if an object appears to be a valid element object.

    That is, an ``_Element`` instance that refers to a node.
    """
    if not isinstance(element, _Element):
        return False
    return (<_Element>element)._c_node is not NULL
def indent(tree, space="  ", *, Py_ssize_t level=0):
    """indent(tree, space="  ", level=0)

    Indent an XML document by inserting newlines and indentation space
    after elements.

    *tree* is the ElementTree or Element to modify.  The (root) element
    itself will not be changed, but the tail text of all elements in its
    subtree will be adapted.

    *space* is the whitespace to insert for each indentation level, two
    space characters by default.

    *level* is the initial indentation level.  Setting this to a higher
    value than 0 can be used for indenting subtrees that are more deeply
    nested inside of a document.

    Raises ValueError if *level* is negative.
    """
    root = _rootNodeOrRaise(tree)
    if level < 0:
        raise ValueError(f"Initial indentation level must be >= 0, got {level}")
    if _hasChild(root._c_node):
        space = _utf8(space)
        # Indentation of the root level, and of its children one level deeper.
        # Deeper levels are added to the list on demand.
        base_indentation = b"\n" + level * space
        _indent_children(root._c_node, 1, space,
                         [base_indentation, base_indentation + space])
cdef int _indent_children(xmlNode* c_node, Py_ssize_t level, bytes one_space, list indentations) except -1:
    # Indent the children of 'c_node', which are at nesting depth 'level'.
    # 'indentations' maps depth => indentation string and grows on demand.
    cdef xmlNode* c_child
    cdef xmlNode* c_next_child

    # Reuse indentation strings for speed.
    if len(indentations) <= level:
        indentations.append(indentations[-1] + one_space)

    # Start a new indentation level for the first child.
    child_indentation = indentations[level]
    if not _hasNonWhitespaceText(c_node):
        _setNodeText(c_node, child_indentation)

    # Recursively indent all children.
    c_child = _findChildForwards(c_node, 0)
    while c_child is not NULL:
        if _hasChild(c_child):
            _indent_children(c_child, level + 1, one_space, indentations)
        c_next_child = _nextElement(c_child)
        if not _hasNonWhitespaceTail(c_child):
            if c_next_child is NULL:
                # Dedent after the last child.
                _setTailText(c_child, indentations[level - 1])
            else:
                _setTailText(c_child, child_indentation)
        c_child = c_next_child
    return 0
def dump(_Element elem not None, *, bint pretty_print=True, bint with_tail=True):
    """dump(elem, pretty_print=True, with_tail=True)

    Writes an element tree or element structure to sys.stdout.  This function
    should be used for debugging only.

    A newline is appended to the output when ``pretty_print`` is disabled.
    """
    serialised = tostring(
        elem, pretty_print=pretty_print, with_tail=with_tail, encoding='unicode')
    sys.stdout.write(serialised if pretty_print else serialised + '\n')
def tostring(element_or_tree, *, encoding=None, method="xml",
             xml_declaration=None, bint pretty_print=False, bint with_tail=True,
             standalone=None, doctype=None,
             # method='c14n'
             bint exclusive=False, inclusive_ns_prefixes=None,
             # method='c14n2'
             bint with_comments=True, bint strip_text=False,
             ):
    """tostring(element_or_tree, encoding=None, method="xml",
                xml_declaration=None, pretty_print=False, with_tail=True,
                standalone=None, doctype=None,
                exclusive=False, inclusive_ns_prefixes=None,
                with_comments=True, strip_text=False,
                )

    Serialize an element to an encoded string representation of its XML
    tree.

    Defaults to ASCII encoding without XML declaration.  This
    behaviour can be configured with the keyword arguments 'encoding'
    (string) and 'xml_declaration' (bool).  Note that changing the
    encoding to a non UTF-8 compatible encoding will enable a
    declaration by default.

    You can also serialise to a Unicode string without declaration by
    passing the name ``'unicode'`` as encoding (or the ``str`` function
    in Py3 or ``unicode`` in Py2).  This changes the return value from
    a byte string to an unencoded unicode string.

    The keyword argument 'pretty_print' (bool) enables formatted XML.

    The keyword argument 'method' selects the output method: 'xml',
    'html', plain 'text' (text content without tags), 'c14n' or 'c14n2'.
    Default is 'xml'.

    With ``method="c14n"`` (C14N version 1), the options ``exclusive``,
    ``with_comments`` and ``inclusive_ns_prefixes`` request exclusive
    C14N, include comments, and list the inclusive prefixes respectively.

    With ``method="c14n2"`` (C14N version 2), the ``with_comments`` and
    ``strip_text`` options control the output of comments and text space
    according to C14N 2.0.

    Passing a boolean value to the ``standalone`` option will output
    an XML declaration with the corresponding ``standalone`` flag.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    You can prevent the tail text of the element from being serialised
    by passing the boolean ``with_tail`` option.  This has no impact
    on the tail text of children, which will always be serialised.
    """
    cdef bint write_declaration
    cdef int is_standalone
    cdef int write_complete_document
    cdef _Element context_node

    # C14N serialisation
    if method in ('c14n', 'c14n2'):
        if encoding is not None:
            raise ValueError("Cannot specify encoding with C14N")
        if xml_declaration:
            raise ValueError("Cannot enable XML declaration in C14N")

        if method == 'c14n':
            return _tostringC14N(element_or_tree, exclusive, with_comments, inclusive_ns_prefixes)

        out = BytesIO()
        target = C14NWriterTarget(
            utf8_writer(out).write,
            with_comments=with_comments, strip_text=strip_text)
        _tree_to_target(element_or_tree, target)
        return out.getvalue()

    # the remaining C14N-only options must be left at their defaults
    if not with_comments:
        raise ValueError("Can only discard comments in C14N serialisation")
    if strip_text:
        raise ValueError("Can only strip text in C14N 2.0 serialisation")

    to_unicode = encoding is unicode or (encoding is not None and encoding.lower() == 'unicode')
    if to_unicode:
        if xml_declaration:
            raise ValueError(
                "Serialisation to unicode must not request an XML declaration")
        write_declaration = 0
        encoding = unicode
    elif xml_declaration is not None:
        write_declaration = xml_declaration
    else:
        # by default, write an XML declaration only for non-standard encodings
        write_declaration = encoding is not None and encoding.upper() not in (
            'ASCII', 'UTF-8', 'UTF8', 'US-ASCII')

    if encoding is None:
        encoding = 'ASCII'

    if standalone is None:
        is_standalone = -1
    else:
        # an explicit standalone flag always requires a declaration
        write_declaration = 1
        is_standalone = 1 if standalone else 0

    if isinstance(element_or_tree, _Element):
        context_node = <_Element>element_or_tree
        write_complete_document = 0
    elif isinstance(element_or_tree, _ElementTree):
        context_node = (<_ElementTree>element_or_tree)._context_node
        write_complete_document = 1
    else:
        raise TypeError(f"Type '{python._fqtypename(element_or_tree).decode('utf8')}' cannot be serialized.")

    return _tostring(context_node, encoding, doctype, method,
                     write_declaration, write_complete_document,
                     pretty_print, with_tail, is_standalone)
def tostringlist(element_or_tree, *args, **kwargs):
    """tostringlist(element_or_tree, *args, **kwargs)

    Serialize an element to an encoded string representation of its XML
    tree, stored in a list of partial strings.

    This is purely for ElementTree 1.3 compatibility.  The result is a
    single string wrapped in a list.
    """
    serialised = tostring(element_or_tree, *args, **kwargs)
    return [serialised]
def tounicode(element_or_tree, *, method="xml", bint pretty_print=False,
              bint with_tail=True, doctype=None):
    """tounicode(element_or_tree, method="xml", pretty_print=False,
                 with_tail=True, doctype=None)

    Serialize an element to the Python unicode representation of its XML
    tree.

    :deprecated: use ``tostring(el, encoding='unicode')`` instead.

    Note that the result does not carry an XML encoding declaration and is
    therefore not necessarily suited for serialization to byte streams without
    further treatment.

    The boolean keyword argument 'pretty_print' enables formatted XML.

    The keyword argument 'method' selects the output method: 'xml',
    'html' or plain 'text'.

    You can prevent the tail text of the element from being serialised
    by passing the boolean ``with_tail`` option.  This has no impact
    on the tail text of children, which will always be serialised.
    """
    cdef _Element context_node
    cdef int write_complete_document

    if isinstance(element_or_tree, _Element):
        context_node = <_Element>element_or_tree
        write_complete_document = 0
    elif isinstance(element_or_tree, _ElementTree):
        context_node = (<_ElementTree>element_or_tree)._context_node
        write_complete_document = 1
    else:
        raise TypeError(f"Type '{type(element_or_tree)}' cannot be serialized.")

    # never write a declaration (0) and leave 'standalone' unset (-1)
    return _tostring(context_node, unicode, doctype, method,
                     0, write_complete_document, pretty_print, with_tail, -1)
def parse(source, _BaseParser parser=None, *, base_url=None):
    """parse(source, parser=None, base_url=None)

    Return an ElementTree object loaded with source elements.  If no parser
    is provided as second argument, the default parser is used.

    The ``source`` can be any of the following:

    - a file name/path
    - a file object
    - a file-like object
    - a URL using the HTTP or FTP protocol

    To parse from a string, use the ``fromstring()`` function instead.

    Note that it is generally faster to parse from a file path or URL
    than from an open file object or file-like object.  Transparent
    decompression from gzip compressed sources is supported (unless
    explicitly disabled in libxml2).

    The ``base_url`` keyword allows setting a URL for the document
    when parsing from a file-like object.  This is needed when looking
    up external entities (DTD, XInclude, ...) with relative paths.
    """
    cdef _Document doc
    try:
        doc = _parseDocument(source, parser, base_url)
        # wrap the parsed document without selecting a context element
        return _elementTreeFactory(doc, None)
    except _TargetParserResult as result_container:
        # a parser target produced the result => return that instead
        return result_container.result
def adopt_external_document(capsule, _BaseParser parser=None):
    """adopt_external_document(capsule, parser=None)

    Unpack a libxml2 document pointer from a PyCapsule and wrap it in an
    lxml ElementTree object.

    This allows external libraries to build XML/HTML trees using libxml2
    and then pass them efficiently into lxml for further processing.

    If a ``parser`` is provided, it will be used for configuring the
    lxml document.  No parsing will be done.

    The capsule must have the name ``"libxml2:xmlDoc"`` and its pointer
    value must reference a correct libxml2 document of type ``xmlDoc*``.
    The creator of the capsule must take care to correctly clean up the
    document using an appropriate capsule destructor.  By default, the
    libxml2 document will be copied to let lxml safely own the memory
    of the internal tree that it uses.

    If the capsule context is non-NULL, it must point to a C string that
    can be compared using ``strcmp()``.  If the context string equals
    ``"destructor:xmlFreeDoc"``, the libxml2 document will not be copied
    but the capsule invalidated instead by clearing its destructor and
    name.  That way, lxml takes ownership of the libxml2 document in memory
    without creating a copy first, and the capsule destructor will not be
    called.  The document will then eventually be cleaned up by lxml using
    the libxml2 API function ``xmlFreeDoc()`` once it is no longer used.

    If no copy is made, later modifications of the tree outside of lxml
    should not be attempted after transferring the ownership.
    """
    cdef xmlDoc* c_doc
    cdef bint is_owned = False
    # 'is_owned' is filled in by the unpacking helper and passed on below
    c_doc = <xmlDoc*> python.lxml_unpack_xmldoc_capsule(capsule, &is_owned)
    doc = _adoptForeignDoc(c_doc, parser, is_owned)
    return _elementTreeFactory(doc, None)
- ################################################################################
- # Include submodules
- include "readonlytree.pxi" # Read-only implementation of Element proxies
- include "classlookup.pxi" # Element class lookup mechanisms
- include "nsclasses.pxi" # Namespace implementation and registry
- include "docloader.pxi" # Support for custom document loaders
- include "parser.pxi" # XML and HTML parsers
- include "saxparser.pxi" # SAX-like Parser interface and tree builder
- include "parsertarget.pxi" # ET Parser target
- include "serializer.pxi" # XML output functions
- include "iterparse.pxi" # incremental XML parsing
- include "xmlid.pxi" # XMLID and IDDict
- include "xinclude.pxi" # XInclude
- include "cleanup.pxi" # Cleanup and recursive element removal functions
- ################################################################################
- # Include submodules for XPath and XSLT
- include "extensions.pxi" # XPath/XSLT extension functions
- include "xpath.pxi" # XPath evaluation
- include "xslt.pxi" # XSL transformations
- include "xsltext.pxi" # XSL extension elements
- ################################################################################
- # Validation
cdef class DocumentInvalid(LxmlError):
    """Validation error.

    Every document validator raises this exception from its
    ``assertValid(tree)`` method when that method fails.
    """
cdef class _Validator:
    """Base class for XML validators.

    The methods below run the validation by calling the validator object
    itself (``self(etree)``).  ``__call__`` is not defined in this class,
    so it is presumably supplied by the concrete validator subclasses.
    Messages are collected in a per-instance error log that is exposed
    (as a copy) through the ``error_log`` property.
    """
    cdef _ErrorLog _error_log

    def __cinit__(self):
        # Each validator instance gets its own _ErrorLog object.
        self._error_log = _ErrorLog()

    def validate(self, etree):
        """validate(self, etree)

        Validate the document using this schema.

        Returns true if document is valid, false if not.
        """
        return self(etree)

    def assertValid(self, etree):
        """assertValid(self, etree)

        Raises `DocumentInvalid` if the document does not comply with the schema.

        The exception is created with the message built from the error log
        and with the error log itself as second argument.
        """
        if not self(etree):
            raise DocumentInvalid(self._error_log._buildExceptionMessage(
                    "Document does not comply with schema"),
                                  self._error_log)

    def assert_(self, etree):
        """assert_(self, etree)

        Raises `AssertionError` if the document does not comply with the schema.
        """
        if not self(etree):
            raise AssertionError, self._error_log._buildExceptionMessage(
                "Document does not comply with schema")

    cpdef _append_log_message(self, int domain, int type, int level, int line,
                              message, filename):
        # Forward a single log entry, unchanged, to this validator's error log.
        self._error_log._receiveGeneric(domain, type, level, line, message,
                                        filename)

    cpdef _clear_error_log(self):
        # Reset this validator's error log.
        self._error_log.clear()

    @property
    def error_log(self):
        """The log of validation errors and warnings.

        A copy of the internal log is returned on each access.
        """
        # The message used to name the XPath evaluator (copy-paste slip);
        # this object is a validator.
        assert self._error_log is not None, "Validator not initialised"
        return self._error_log.copy()
- include "dtd.pxi" # DTD
- include "relaxng.pxi" # RelaxNG
- include "xmlschema.pxi" # XMLSchema
- include "schematron.pxi" # Schematron (requires libxml2 2.6.21+)
- ################################################################################
- # Public C API
- include "public-api.pxi"
- ################################################################################
- # Other stuff
- include "debug.pxi"
|