sax.py 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285
  1. """
  2. SAX-based adapter to copy trees from/to the Python standard library.
  3. Use the `ElementTreeContentHandler` class to build an ElementTree from
  4. SAX events.
  5. Use the `ElementTreeProducer` class or the `saxify()` function to fire
  6. the SAX events of an ElementTree against a SAX ContentHandler.
  7. See https://lxml.de/sax.html
  8. """
  9. from xml.sax.handler import ContentHandler
  10. from lxml import etree
  11. from lxml.etree import ElementTree, SubElement
  12. from lxml.etree import Comment, ProcessingInstruction
  13. try:
  14. from types import GenericAlias as _GenericAlias
  15. except ImportError:
  16. # Python 3.8 - we only need this as return value from "__class_getitem__"
  17. def _GenericAlias(cls, item):
  18. return f"{cls.__name__}[{item.__name__}]"
class SaxError(etree.LxmlError):
    """General SAX error.

    Raised by `ElementTreeContentHandler.endElementNS()` when the element
    being closed does not match the element that is currently open.
    """
  22. def _getNsTag(tag):
  23. if tag[0] == '{' and '}' in tag:
  24. return tuple(tag[1:].split('}', 1))
  25. else:
  26. return None, tag
  27. class ElementTreeContentHandler(ContentHandler):
  28. """Build an lxml ElementTree from SAX events.
  29. """
  30. def __init__(self, makeelement=None):
  31. ContentHandler.__init__(self)
  32. self._root = None
  33. self._root_siblings = []
  34. self._element_stack = []
  35. self._default_ns = None
  36. self._ns_mapping = { None : [None] }
  37. self._new_mappings = {}
  38. if makeelement is None:
  39. makeelement = etree.Element
  40. self._makeelement = makeelement
  41. def _get_etree(self):
  42. "Contains the generated ElementTree after parsing is finished."
  43. return ElementTree(self._root)
  44. etree = property(_get_etree, doc=_get_etree.__doc__)
  45. def setDocumentLocator(self, locator):
  46. pass
  47. def startDocument(self):
  48. pass
  49. def endDocument(self):
  50. pass
  51. def startPrefixMapping(self, prefix, uri):
  52. self._new_mappings[prefix] = uri
  53. try:
  54. self._ns_mapping[prefix].append(uri)
  55. except KeyError:
  56. self._ns_mapping[prefix] = [uri]
  57. if prefix is None:
  58. self._default_ns = uri
  59. def endPrefixMapping(self, prefix):
  60. ns_uri_list = self._ns_mapping[prefix]
  61. ns_uri_list.pop()
  62. if prefix is None:
  63. self._default_ns = ns_uri_list[-1]
  64. def _buildTag(self, ns_name_tuple):
  65. ns_uri, local_name = ns_name_tuple
  66. if ns_uri:
  67. el_tag = "{%s}%s" % ns_name_tuple
  68. elif self._default_ns:
  69. el_tag = "{%s}%s" % (self._default_ns, local_name)
  70. else:
  71. el_tag = local_name
  72. return el_tag
  73. def startElementNS(self, ns_name, qname, attributes=None):
  74. el_name = self._buildTag(ns_name)
  75. if attributes:
  76. attrs = {}
  77. try:
  78. iter_attributes = attributes.iteritems()
  79. except AttributeError:
  80. iter_attributes = attributes.items()
  81. for name_tuple, value in iter_attributes:
  82. if name_tuple[0]:
  83. attr_name = "{%s}%s" % name_tuple
  84. else:
  85. attr_name = name_tuple[1]
  86. attrs[attr_name] = value
  87. else:
  88. attrs = None
  89. element_stack = self._element_stack
  90. if self._root is None:
  91. element = self._root = \
  92. self._makeelement(el_name, attrs, self._new_mappings)
  93. if self._root_siblings and hasattr(element, 'addprevious'):
  94. for sibling in self._root_siblings:
  95. element.addprevious(sibling)
  96. del self._root_siblings[:]
  97. else:
  98. element = SubElement(element_stack[-1], el_name,
  99. attrs, self._new_mappings)
  100. element_stack.append(element)
  101. self._new_mappings.clear()
  102. def processingInstruction(self, target, data):
  103. pi = ProcessingInstruction(target, data)
  104. if self._root is None:
  105. self._root_siblings.append(pi)
  106. else:
  107. self._element_stack[-1].append(pi)
  108. def endElementNS(self, ns_name, qname):
  109. element = self._element_stack.pop()
  110. el_tag = self._buildTag(ns_name)
  111. if el_tag != element.tag:
  112. raise SaxError("Unexpected element closed: " + el_tag)
  113. def startElement(self, name, attributes=None):
  114. if attributes:
  115. attributes = {(None, k): v for k, v in attributes.items()}
  116. self.startElementNS((None, name), name, attributes)
  117. def endElement(self, name):
  118. self.endElementNS((None, name), name)
  119. def characters(self, data):
  120. last_element = self._element_stack[-1]
  121. try:
  122. # if there already is a child element, we must append to its tail
  123. last_element = last_element[-1]
  124. except IndexError:
  125. # otherwise: append to the text
  126. last_element.text = (last_element.text or '') + data
  127. else:
  128. last_element.tail = (last_element.tail or '') + data
  129. ignorableWhitespace = characters
  130. # Allow subscripting sax.ElementTreeContentHandler in type annotions (PEP 560)
  131. def __class_getitem__(cls, item):
  132. return _GenericAlias(cls, item)
  133. class ElementTreeProducer:
  134. """Produces SAX events for an element and children.
  135. """
  136. def __init__(self, element_or_tree, content_handler):
  137. try:
  138. element = element_or_tree.getroot()
  139. except AttributeError:
  140. element = element_or_tree
  141. self._element = element
  142. self._content_handler = content_handler
  143. from xml.sax.xmlreader import AttributesNSImpl as attr_class
  144. self._attr_class = attr_class
  145. self._empty_attributes = attr_class({}, {})
  146. def saxify(self):
  147. self._content_handler.startDocument()
  148. element = self._element
  149. if hasattr(element, 'getprevious'):
  150. siblings = []
  151. sibling = element.getprevious()
  152. while getattr(sibling, 'tag', None) is ProcessingInstruction:
  153. siblings.append(sibling)
  154. sibling = sibling.getprevious()
  155. for sibling in siblings[::-1]:
  156. self._recursive_saxify(sibling, {})
  157. self._recursive_saxify(element, {})
  158. if hasattr(element, 'getnext'):
  159. sibling = element.getnext()
  160. while getattr(sibling, 'tag', None) is ProcessingInstruction:
  161. self._recursive_saxify(sibling, {})
  162. sibling = sibling.getnext()
  163. self._content_handler.endDocument()
  164. def _recursive_saxify(self, element, parent_nsmap):
  165. content_handler = self._content_handler
  166. tag = element.tag
  167. if tag is Comment or tag is ProcessingInstruction:
  168. if tag is ProcessingInstruction:
  169. content_handler.processingInstruction(
  170. element.target, element.text)
  171. tail = element.tail
  172. if tail:
  173. content_handler.characters(tail)
  174. return
  175. element_nsmap = element.nsmap
  176. new_prefixes = []
  177. if element_nsmap != parent_nsmap:
  178. # There have been updates to the namespace
  179. for prefix, ns_uri in element_nsmap.items():
  180. if parent_nsmap.get(prefix) != ns_uri:
  181. new_prefixes.append( (prefix, ns_uri) )
  182. attribs = element.items()
  183. if attribs:
  184. attr_values = {}
  185. attr_qnames = {}
  186. for attr_ns_name, value in attribs:
  187. attr_ns_tuple = _getNsTag(attr_ns_name)
  188. attr_values[attr_ns_tuple] = value
  189. attr_qnames[attr_ns_tuple] = self._build_qname(
  190. attr_ns_tuple[0], attr_ns_tuple[1], element_nsmap,
  191. preferred_prefix=None, is_attribute=True)
  192. sax_attributes = self._attr_class(attr_values, attr_qnames)
  193. else:
  194. sax_attributes = self._empty_attributes
  195. ns_uri, local_name = _getNsTag(tag)
  196. qname = self._build_qname(
  197. ns_uri, local_name, element_nsmap, element.prefix, is_attribute=False)
  198. for prefix, uri in new_prefixes:
  199. content_handler.startPrefixMapping(prefix, uri)
  200. content_handler.startElementNS(
  201. (ns_uri, local_name), qname, sax_attributes)
  202. text = element.text
  203. if text:
  204. content_handler.characters(text)
  205. for child in element:
  206. self._recursive_saxify(child, element_nsmap)
  207. content_handler.endElementNS((ns_uri, local_name), qname)
  208. for prefix, uri in new_prefixes:
  209. content_handler.endPrefixMapping(prefix)
  210. tail = element.tail
  211. if tail:
  212. content_handler.characters(tail)
  213. def _build_qname(self, ns_uri, local_name, nsmap, preferred_prefix, is_attribute):
  214. if ns_uri is None:
  215. return local_name
  216. if not is_attribute and nsmap.get(preferred_prefix) == ns_uri:
  217. prefix = preferred_prefix
  218. else:
  219. # Pick the first matching prefix, in alphabetical order.
  220. candidates = [
  221. pfx for (pfx, uri) in nsmap.items()
  222. if pfx is not None and uri == ns_uri
  223. ]
  224. prefix = (
  225. candidates[0] if len(candidates) == 1
  226. else min(candidates) if candidates
  227. else None
  228. )
  229. if prefix is None:
  230. # Default namespace
  231. return local_name
  232. return prefix + ':' + local_name
  233. def saxify(element_or_tree, content_handler):
  234. """One-shot helper to generate SAX events from an XML tree and fire
  235. them against a SAX ContentHandler.
  236. """
  237. return ElementTreeProducer(element_or_tree, content_handler).saxify()