domish.py 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901
  1. # -*- test-case-name: twisted.words.test.test_domish -*-
  2. # Copyright (c) Twisted Matrix Laboratories.
  3. # See LICENSE for details.
  4. """
  5. DOM-like XML processing support.
  6. This module provides support for parsing XML into DOM-like object structures
  7. and serializing such structures to an XML string representation, optimized
  8. for use in streaming XML applications.
  9. """
  10. from typing import cast
  11. from zope.interface import Attribute, Interface, implementer
  12. from twisted.web import sux
  13. def _splitPrefix(name):
  14. """Internal method for splitting a prefixed Element name into its
  15. respective parts"""
  16. ntok = name.split(":", 1)
  17. if len(ntok) == 2:
  18. return ntok
  19. else:
  20. return (None, ntok[0])
  21. # Global map of prefixes that always get injected
  22. # into the serializers prefix map (note, that doesn't
  23. # mean they're always _USED_)
  24. G_PREFIXES = {"http://www.w3.org/XML/1998/namespace": "xml"}
  25. class _ListSerializer:
  26. """Internal class which serializes an Element tree into a buffer"""
  27. def __init__(self, prefixes=None, prefixesInScope=None):
  28. self.writelist = []
  29. self.prefixes = {}
  30. if prefixes:
  31. self.prefixes.update(prefixes)
  32. self.prefixes.update(G_PREFIXES)
  33. self.prefixStack = [G_PREFIXES.values()] + (prefixesInScope or [])
  34. self.prefixCounter = 0
  35. def getValue(self):
  36. return "".join(self.writelist)
  37. def getPrefix(self, uri):
  38. if uri not in self.prefixes:
  39. self.prefixes[uri] = "xn%d" % (self.prefixCounter)
  40. self.prefixCounter = self.prefixCounter + 1
  41. return self.prefixes[uri]
  42. def prefixInScope(self, prefix):
  43. stack = self.prefixStack
  44. for i in range(-1, (len(self.prefixStack) + 1) * -1, -1):
  45. if prefix in stack[i]:
  46. return True
  47. return False
  48. def serialize(self, elem, closeElement=1, defaultUri=""):
  49. # Optimization shortcuts
  50. write = self.writelist.append
  51. # Shortcut, check to see if elem is actually a chunk o' serialized XML
  52. if isinstance(elem, SerializedXML):
  53. write(elem)
  54. return
  55. # Shortcut, check to see if elem is actually a string (aka Cdata)
  56. if isinstance(elem, str):
  57. write(escapeToXml(elem))
  58. return
  59. # Further optimizations
  60. name = elem.name
  61. uri = elem.uri
  62. defaultUri, currentDefaultUri = elem.defaultUri, defaultUri
  63. for p, u in elem.localPrefixes.items():
  64. self.prefixes[u] = p
  65. self.prefixStack.append(list(elem.localPrefixes.keys()))
  66. # Inherit the default namespace
  67. if defaultUri is None:
  68. defaultUri = currentDefaultUri
  69. if uri is None:
  70. uri = defaultUri
  71. prefix = None
  72. if uri != defaultUri or uri in self.prefixes:
  73. prefix = self.getPrefix(uri)
  74. inScope = self.prefixInScope(prefix)
  75. # Create the starttag
  76. if not prefix:
  77. write("<%s" % (name))
  78. else:
  79. write(f"<{prefix}:{name}")
  80. if not inScope:
  81. write(f" xmlns:{prefix}='{uri}'")
  82. self.prefixStack[-1].append(prefix)
  83. inScope = True
  84. if defaultUri != currentDefaultUri and (
  85. uri != defaultUri or not prefix or not inScope
  86. ):
  87. write(" xmlns='%s'" % (defaultUri))
  88. for p, u in elem.localPrefixes.items():
  89. write(f" xmlns:{p}='{u}'")
  90. # Serialize attributes
  91. for k, v in elem.attributes.items():
  92. # If the attribute name is a tuple, it's a qualified attribute
  93. if isinstance(k, tuple):
  94. attr_uri, attr_name = k
  95. attr_prefix = self.getPrefix(attr_uri)
  96. if not self.prefixInScope(attr_prefix):
  97. write(f" xmlns:{attr_prefix}='{attr_uri}'")
  98. self.prefixStack[-1].append(attr_prefix)
  99. write(f" {attr_prefix}:{attr_name}='{escapeToXml(v, 1)}'")
  100. else:
  101. write(f" {k}='{escapeToXml(v, 1)}'")
  102. # Shortcut out if this is only going to return
  103. # the element (i.e. no children)
  104. if closeElement == 0:
  105. write(">")
  106. return
  107. # Serialize children
  108. if len(elem.children) > 0:
  109. write(">")
  110. for c in elem.children:
  111. self.serialize(c, defaultUri=defaultUri)
  112. # Add closing tag
  113. if not prefix:
  114. write("</%s>" % (name))
  115. else:
  116. write(f"</{prefix}:{name}>")
  117. else:
  118. write("/>")
  119. self.prefixStack.pop()
  120. SerializerClass = _ListSerializer
  121. def escapeToXml(text, isattrib=0):
  122. """Escape text to proper XML form, per section 2.3 in the XML specification.
  123. @type text: C{str}
  124. @param text: Text to escape
  125. @type isattrib: C{bool}
  126. @param isattrib: Triggers escaping of characters necessary for use as
  127. attribute values
  128. """
  129. text = text.replace("&", "&amp;")
  130. text = text.replace("<", "&lt;")
  131. text = text.replace(">", "&gt;")
  132. if isattrib == 1:
  133. text = text.replace("'", "&apos;")
  134. text = text.replace('"', "&quot;")
  135. return text
  136. def unescapeFromXml(text):
  137. text = text.replace("&lt;", "<")
  138. text = text.replace("&gt;", ">")
  139. text = text.replace("&apos;", "'")
  140. text = text.replace("&quot;", '"')
  141. text = text.replace("&amp;", "&")
  142. return text
  143. def generateOnlyInterface(list, int):
  144. """Filters items in a list by class"""
  145. for n in list:
  146. if int.providedBy(n):
  147. yield n
  148. def generateElementsQNamed(list, name, uri):
  149. """Filters Element items in a list with matching name and URI."""
  150. for n in list:
  151. if IElement.providedBy(n) and n.name == name and n.uri == uri:
  152. yield n
  153. def generateElementsNamed(list, name):
  154. """Filters Element items in a list with matching name, regardless of URI."""
  155. for n in list:
  156. if IElement.providedBy(n) and n.name == name:
  157. yield n
  158. class SerializedXML(str):
  159. """Marker class for pre-serialized XML in the DOM."""
  160. pass
  161. class Namespace:
  162. """Convenience object for tracking namespace declarations."""
  163. def __init__(self, uri):
  164. self._uri = uri
  165. def __getattr__(self, n):
  166. return (self._uri, n)
  167. def __getitem__(self, n):
  168. return (self._uri, n)
class IElement(Interface):
    """
    Interface to XML element nodes.

    See L{Element} for a detailed example of its general use.

    Warning: this Interface is not yet complete!
    """

    # Data attributes every element node exposes.
    uri = Attribute(""" Element's namespace URI """)
    name = Attribute(""" Element's local name """)
    defaultUri = Attribute(""" Default namespace URI of child elements """)
    attributes = Attribute(""" Dictionary of element attributes """)
    children = Attribute(""" List of child nodes """)
    parent = Attribute(""" Reference to element's parent element """)
    localPrefixes = Attribute(""" Dictionary of local prefixes """)

    def toXml(prefixes=None, closeElement=1, defaultUri="", prefixesInScope=None):
        """
        Serializes object to a (partial) XML document.

        @param prefixes: dictionary that maps namespace URIs to suggested
            prefix names.
        @type prefixes: L{dict}

        @param closeElement: flag that determines whether to include the
            closing tag of the element in the serialized string. A value of
            C{0} only generates the element's start tag. A value of C{1}
            yields a complete serialization.
        @type closeElement: L{int}

        @param defaultUri: Initial default namespace URI. This is most useful
            for partial rendering, where the logical parent element (of which
            the starttag was already serialized) declares a default namespace
            that should be inherited.
        @type defaultUri: L{str}

        @param prefixesInScope: list of prefixes that are assumed to be
            declared by ancestors.
        @type prefixesInScope: L{list}

        @return: (partial) serialized XML
        @rtype: L{str}
        """

    def addElement(name, defaultUri=None, content=None):
        """
        Create an element and add as child.

        The new element is added to this element as a child, and will have
        this element as its parent.

        @param name: element name. This can be either a L{str} object that
            contains the local name, or a tuple of (uri, local_name) for a
            fully qualified name. In the former case, the namespace URI is
            inherited from this element.
        @type name: L{str} or L{tuple} of (L{str}, L{str})

        @param defaultUri: default namespace URI for child elements. If
            L{None}, this is inherited from this element.
        @type defaultUri: L{str}

        @param content: text contained by the new element.
        @type content: L{str}

        @return: the created element
        @rtype: object providing L{IElement}
        """

    def addChild(node):
        """
        Adds a node as child of this element.

        The C{node} will be added to the list of children of this element,
        and will have this element set as its parent when C{node} provides
        L{IElement}. If C{node} is a L{str} and the current last child is
        character data (L{str}), the text from C{node} is appended to the
        existing last child.

        @param node: the child node.
        @type node: L{str} or object implementing L{IElement}
        """

    def addContent(text):
        """
        Adds character data to this element.

        If the current last child of this element is a string, the text will
        be appended to that string. Otherwise, the text will be added as a new
        child.

        @param text: The character data to be added to this element.
        @type text: L{str}
        """
  241. @implementer(IElement)
  242. class Element:
  243. """Represents an XML element node.
  244. An Element contains a series of attributes (name/value pairs), content
  245. (character data), and other child Element objects. When building a document
  246. with markup (such as HTML or XML), use this object as the starting point.
  247. Element objects fully support XML Namespaces. The fully qualified name of
  248. the XML Element it represents is stored in the C{uri} and C{name}
  249. attributes, where C{uri} holds the namespace URI. There is also a default
  250. namespace, for child elements. This is stored in the C{defaultUri}
  251. attribute. Note that C{''} means the empty namespace.
  252. Serialization of Elements through C{toXml()} will use these attributes
  253. for generating proper serialized XML. When both C{uri} and C{defaultUri}
  254. are not None in the Element and all of its descendents, serialization
  255. proceeds as expected:
  256. >>> from twisted.words.xish import domish
  257. >>> root = domish.Element(('myns', 'root'))
  258. >>> root.addElement('child', content='test')
  259. <twisted.words.xish.domish.Element object at 0x83002ac>
  260. >>> root.toXml()
  261. u"<root xmlns='myns'><child>test</child></root>"
  262. For partial serialization, needed for streaming XML, a special value for
  263. namespace URIs can be used: L{None}.
  264. Using L{None} as the value for C{uri} means: this element is in whatever
  265. namespace inherited by the closest logical ancestor when the complete XML
  266. document has been serialized. The serialized start tag will have a
  267. non-prefixed name, and no xmlns declaration will be generated.
  268. Similarly, L{None} for C{defaultUri} means: the default namespace for my
  269. child elements is inherited from the logical ancestors of this element,
  270. when the complete XML document has been serialized.
  271. To illustrate, an example from a Jabber stream. Assume the start tag of the
  272. root element of the stream has already been serialized, along with several
  273. complete child elements, and sent off, looking like this::
  274. <stream:stream xmlns:stream='http://etherx.jabber.org/streams'
  275. xmlns='jabber:client' to='example.com'>
  276. ...
  277. Now suppose we want to send a complete element represented by an
  278. object C{message} created like:
  279. >>> message = domish.Element((None, 'message'))
  280. >>> message['to'] = 'user@example.com'
  281. >>> message.addElement('body', content='Hi!')
  282. <twisted.words.xish.domish.Element object at 0x8276e8c>
  283. >>> message.toXml()
  284. u"<message to='user@example.com'><body>Hi!</body></message>"
  285. As, you can see, this XML snippet has no xmlns declaration. When sent
  286. off, it inherits the C{jabber:client} namespace from the root element.
  287. Note that this renders the same as using C{''} instead of L{None}:
  288. >>> presence = domish.Element(('', 'presence'))
  289. >>> presence.toXml()
  290. u"<presence/>"
  291. However, if this object has a parent defined, the difference becomes
  292. clear:
  293. >>> child = message.addElement(('http://example.com/', 'envelope'))
  294. >>> child.addChild(presence)
  295. <twisted.words.xish.domish.Element object at 0x8276fac>
  296. >>> message.toXml()
  297. u"<message to='user@example.com'><body>Hi!</body><envelope xmlns='http://example.com/'><presence xmlns=''/></envelope></message>"
  298. As, you can see, the <presence/> element is now in the empty namespace, not
  299. in the default namespace of the parent or the streams'.
  300. @type uri: L{str} or None
  301. @ivar uri: URI of this Element's name
  302. @type name: L{str}
  303. @ivar name: Name of this Element
  304. @type defaultUri: L{str} or None
  305. @ivar defaultUri: URI this Element exists within
  306. @type children: L{list}
  307. @ivar children: List of child Elements and content
  308. @type parent: L{Element}
  309. @ivar parent: Reference to the parent Element, if any.
  310. @type attributes: L{dict}
  311. @ivar attributes: Dictionary of attributes associated with this Element.
  312. @type localPrefixes: L{dict}
  313. @ivar localPrefixes: Dictionary of namespace declarations on this
  314. element. The key is the prefix to bind the
  315. namespace uri to.
  316. """
  317. _idCounter = 0
  318. def __init__(self, qname, defaultUri=None, attribs=None, localPrefixes=None):
  319. """
  320. @param qname: Tuple of (uri, name)
  321. @param defaultUri: The default URI of the element; defaults to the URI
  322. specified in C{qname}
  323. @param attribs: Dictionary of attributes
  324. @param localPrefixes: Dictionary of namespace declarations on this
  325. element. The key is the prefix to bind the
  326. namespace uri to.
  327. """
  328. self.localPrefixes = localPrefixes or {}
  329. self.uri, self.name = qname
  330. if defaultUri is None and self.uri not in self.localPrefixes.values():
  331. self.defaultUri = self.uri
  332. else:
  333. self.defaultUri = defaultUri
  334. self.attributes = attribs or {}
  335. self.children = []
  336. self.parent = None
  337. def __getattr__(self, key):
  338. # Check child list for first Element with a name matching the key
  339. for n in self.children:
  340. if IElement.providedBy(n) and n.name == key:
  341. return n
  342. # Tweak the behaviour so that it's more friendly about not
  343. # finding elements -- we need to document this somewhere :)
  344. if key.startswith("_"):
  345. raise AttributeError(key)
  346. else:
  347. return None
  348. def __getitem__(self, key):
  349. return self.attributes[self._dqa(key)]
  350. def __delitem__(self, key):
  351. del self.attributes[self._dqa(key)]
  352. def __setitem__(self, key, value):
  353. self.attributes[self._dqa(key)] = value
  354. def __unicode__(self):
  355. """
  356. Retrieve the first CData (content) node
  357. """
  358. for n in self.children:
  359. if isinstance(n, str):
  360. return n
  361. return ""
  362. def __bytes__(self):
  363. """
  364. Retrieve the first character data node as UTF-8 bytes.
  365. """
  366. return str(self).encode("utf-8")
  367. __str__ = __unicode__
  368. def _dqa(self, attr):
  369. """Dequalify an attribute key as needed"""
  370. if isinstance(attr, tuple) and not attr[0]:
  371. return attr[1]
  372. else:
  373. return attr
  374. def getAttribute(self, attribname, default=None):
  375. """Retrieve the value of attribname, if it exists"""
  376. return self.attributes.get(attribname, default)
  377. def hasAttribute(self, attrib):
  378. """Determine if the specified attribute exists"""
  379. return self._dqa(attrib) in self.attributes
  380. def compareAttribute(self, attrib, value):
  381. """Safely compare the value of an attribute against a provided value.
  382. L{None}-safe.
  383. """
  384. return self.attributes.get(self._dqa(attrib), None) == value
  385. def swapAttributeValues(self, left, right):
  386. """Swap the values of two attribute."""
  387. d = self.attributes
  388. l = d[left]
  389. d[left] = d[right]
  390. d[right] = l
  391. def addChild(self, node):
  392. """Add a child to this Element."""
  393. if IElement.providedBy(node):
  394. node.parent = self
  395. self.children.append(node)
  396. return node
  397. def addContent(self, text: str) -> str:
  398. """Add some text data to this Element."""
  399. if not isinstance(text, str):
  400. raise TypeError(f"Expected str not {text!r} ({type(text).__name__})")
  401. c = self.children
  402. if len(c) > 0 and isinstance(c[-1], str):
  403. c[-1] = c[-1] + text
  404. else:
  405. c.append(text)
  406. return cast(str, c[-1])
  407. def addElement(self, name, defaultUri=None, content=None):
  408. if isinstance(name, tuple):
  409. if defaultUri is None:
  410. defaultUri = name[0]
  411. child = Element(name, defaultUri)
  412. else:
  413. if defaultUri is None:
  414. defaultUri = self.defaultUri
  415. child = Element((defaultUri, name), defaultUri)
  416. self.addChild(child)
  417. if content:
  418. child.addContent(content)
  419. return child
  420. def addRawXml(self, rawxmlstring):
  421. """Add a pre-serialized chunk o' XML as a child of this Element."""
  422. self.children.append(SerializedXML(rawxmlstring))
  423. def addUniqueId(self):
  424. """Add a unique (across a given Python session) id attribute to this
  425. Element.
  426. """
  427. self.attributes["id"] = "H_%d" % Element._idCounter
  428. Element._idCounter = Element._idCounter + 1
  429. def elements(self, uri=None, name=None):
  430. """
  431. Iterate across all children of this Element that are Elements.
  432. Returns a generator over the child elements. If both the C{uri} and
  433. C{name} parameters are set, the returned generator will only yield
  434. on elements matching the qualified name.
  435. @param uri: Optional element URI.
  436. @type uri: L{str}
  437. @param name: Optional element name.
  438. @type name: L{str}
  439. @return: Iterator that yields objects implementing L{IElement}.
  440. """
  441. if name is None:
  442. return generateOnlyInterface(self.children, IElement)
  443. else:
  444. return generateElementsQNamed(self.children, name, uri)
  445. def toXml(self, prefixes=None, closeElement=1, defaultUri="", prefixesInScope=None):
  446. """Serialize this Element and all children to a string."""
  447. s = SerializerClass(prefixes=prefixes, prefixesInScope=prefixesInScope)
  448. s.serialize(self, closeElement=closeElement, defaultUri=defaultUri)
  449. return s.getValue()
  450. def firstChildElement(self):
  451. for c in self.children:
  452. if IElement.providedBy(c):
  453. return c
  454. return None
  455. class ParserError(Exception):
  456. """Exception thrown when a parsing error occurs"""
  457. pass
  458. def elementStream():
  459. """Preferred method to construct an ElementStream
  460. Uses Expat-based stream if available, and falls back to Sux if necessary.
  461. """
  462. try:
  463. es = ExpatElementStream()
  464. return es
  465. except ImportError:
  466. if SuxElementStream is None:
  467. raise Exception("No parsers available :(")
  468. es = SuxElementStream()
  469. return es
  470. class SuxElementStream(sux.XMLParser):
  471. def __init__(self):
  472. self.connectionMade()
  473. self.DocumentStartEvent = None
  474. self.ElementEvent = None
  475. self.DocumentEndEvent = None
  476. self.currElem = None
  477. self.rootElem = None
  478. self.documentStarted = False
  479. self.defaultNsStack = []
  480. self.prefixStack = []
  481. def parse(self, buffer):
  482. try:
  483. self.dataReceived(buffer)
  484. except sux.ParseError as e:
  485. raise ParserError(str(e))
  486. def findUri(self, prefix):
  487. # Walk prefix stack backwards, looking for the uri
  488. # matching the specified prefix
  489. stack = self.prefixStack
  490. for i in range(-1, (len(self.prefixStack) + 1) * -1, -1):
  491. if prefix in stack[i]:
  492. return stack[i][prefix]
  493. return None
  494. def gotTagStart(self, name, attributes):
  495. defaultUri = None
  496. localPrefixes = {}
  497. attribs = {}
  498. uri = None
  499. # Pass 1 - Identify namespace decls
  500. for k, v in list(attributes.items()):
  501. if k.startswith("xmlns"):
  502. x, p = _splitPrefix(k)
  503. if x is None: # I.e. default declaration
  504. defaultUri = v
  505. else:
  506. localPrefixes[p] = v
  507. del attributes[k]
  508. # Push namespace decls onto prefix stack
  509. self.prefixStack.append(localPrefixes)
  510. # Determine default namespace for this element; if there
  511. # is one
  512. if defaultUri is None:
  513. if len(self.defaultNsStack) > 0:
  514. defaultUri = self.defaultNsStack[-1]
  515. else:
  516. defaultUri = ""
  517. # Fix up name
  518. prefix, name = _splitPrefix(name)
  519. if prefix is None: # This element is in the default namespace
  520. uri = defaultUri
  521. else:
  522. # Find the URI for the prefix
  523. uri = self.findUri(prefix)
  524. # Pass 2 - Fix up and escape attributes
  525. for k, v in attributes.items():
  526. p, n = _splitPrefix(k)
  527. if p is None:
  528. attribs[n] = v
  529. else:
  530. attribs[(self.findUri(p)), n] = unescapeFromXml(v)
  531. # Construct the actual Element object
  532. e = Element((uri, name), defaultUri, attribs, localPrefixes)
  533. # Save current default namespace
  534. self.defaultNsStack.append(defaultUri)
  535. # Document already started
  536. if self.documentStarted:
  537. # Starting a new packet
  538. if self.currElem is None:
  539. self.currElem = e
  540. # Adding to existing element
  541. else:
  542. self.currElem = self.currElem.addChild(e)
  543. # New document
  544. else:
  545. self.rootElem = e
  546. self.documentStarted = True
  547. self.DocumentStartEvent(e)
  548. def gotText(self, data):
  549. if self.currElem is not None:
  550. if isinstance(data, bytes):
  551. data = data.decode("ascii")
  552. self.currElem.addContent(data)
  553. def gotCData(self, data):
  554. if self.currElem is not None:
  555. if isinstance(data, bytes):
  556. data = data.decode("ascii")
  557. self.currElem.addContent(data)
  558. def gotComment(self, data):
  559. # Ignore comments for the moment
  560. pass
  561. entities = {
  562. "amp": "&",
  563. "lt": "<",
  564. "gt": ">",
  565. "apos": "'",
  566. "quot": '"',
  567. }
  568. def gotEntityReference(self, entityRef):
  569. # If this is an entity we know about, add it as content
  570. # to the current element
  571. if entityRef in SuxElementStream.entities:
  572. data = SuxElementStream.entities[entityRef]
  573. if isinstance(data, bytes):
  574. data = data.decode("ascii")
  575. self.currElem.addContent(data)
  576. def gotTagEnd(self, name):
  577. # Ensure the document hasn't already ended
  578. if self.rootElem is None:
  579. # XXX: Write more legible explanation
  580. raise ParserError("Element closed after end of document.")
  581. # Fix up name
  582. prefix, name = _splitPrefix(name)
  583. if prefix is None:
  584. uri = self.defaultNsStack[-1]
  585. else:
  586. uri = self.findUri(prefix)
  587. # End of document
  588. if self.currElem is None:
  589. # Ensure element name and uri matches
  590. if self.rootElem.name != name or self.rootElem.uri != uri:
  591. raise ParserError("Mismatched root elements")
  592. self.DocumentEndEvent()
  593. self.rootElem = None
  594. # Other elements
  595. else:
  596. # Ensure the tag being closed matches the name of the current
  597. # element
  598. if self.currElem.name != name or self.currElem.uri != uri:
  599. # XXX: Write more legible explanation
  600. raise ParserError("Malformed element close")
  601. # Pop prefix and default NS stack
  602. self.prefixStack.pop()
  603. self.defaultNsStack.pop()
  604. # Check for parent null parent of current elem;
  605. # that's the top of the stack
  606. if self.currElem.parent is None:
  607. self.currElem.parent = self.rootElem
  608. self.ElementEvent(self.currElem)
  609. self.currElem = None
  610. # Anything else is just some element wrapping up
  611. else:
  612. self.currElem = self.currElem.parent
  613. class ExpatElementStream:
  614. def __init__(self):
  615. import pyexpat
  616. self.DocumentStartEvent = None
  617. self.ElementEvent = None
  618. self.DocumentEndEvent = None
  619. self.error = pyexpat.error
  620. self.parser = pyexpat.ParserCreate("UTF-8", " ")
  621. self.parser.StartElementHandler = self._onStartElement
  622. self.parser.EndElementHandler = self._onEndElement
  623. self.parser.CharacterDataHandler = self._onCdata
  624. self.parser.StartNamespaceDeclHandler = self._onStartNamespace
  625. self.parser.EndNamespaceDeclHandler = self._onEndNamespace
  626. self.currElem = None
  627. self.defaultNsStack = [""]
  628. self.documentStarted = 0
  629. self.localPrefixes = {}
  630. def parse(self, buffer):
  631. try:
  632. self.parser.Parse(buffer)
  633. except self.error as e:
  634. raise ParserError(str(e))
  635. def _onStartElement(self, name, attrs):
  636. # Generate a qname tuple from the provided name. See
  637. # http://docs.python.org/library/pyexpat.html#xml.parsers.expat.ParserCreate
  638. # for an explanation of the formatting of name.
  639. qname = name.rsplit(" ", 1)
  640. if len(qname) == 1:
  641. qname = ("", name)
  642. # Process attributes
  643. newAttrs = {}
  644. toDelete = []
  645. for k, v in attrs.items():
  646. if " " in k:
  647. aqname = k.rsplit(" ", 1)
  648. newAttrs[(aqname[0], aqname[1])] = v
  649. toDelete.append(k)
  650. attrs.update(newAttrs)
  651. for k in toDelete:
  652. del attrs[k]
  653. # Construct the new element
  654. e = Element(qname, self.defaultNsStack[-1], attrs, self.localPrefixes)
  655. self.localPrefixes = {}
  656. # Document already started
  657. if self.documentStarted == 1:
  658. if self.currElem != None:
  659. self.currElem.children.append(e)
  660. e.parent = self.currElem
  661. self.currElem = e
  662. # New document
  663. else:
  664. self.documentStarted = 1
  665. self.DocumentStartEvent(e)
  666. def _onEndElement(self, _):
  667. # Check for null current elem; end of doc
  668. if self.currElem is None:
  669. self.DocumentEndEvent()
  670. # Check for parent that is None; that's
  671. # the top of the stack
  672. elif self.currElem.parent is None:
  673. self.ElementEvent(self.currElem)
  674. self.currElem = None
  675. # Anything else is just some element in the current
  676. # packet wrapping up
  677. else:
  678. self.currElem = self.currElem.parent
  679. def _onCdata(self, data):
  680. if self.currElem != None:
  681. self.currElem.addContent(data)
  682. def _onStartNamespace(self, prefix, uri):
  683. # If this is the default namespace, put
  684. # it on the stack
  685. if prefix is None:
  686. self.defaultNsStack.append(uri)
  687. else:
  688. self.localPrefixes[prefix] = uri
  689. def _onEndNamespace(self, prefix):
  690. # Remove last element on the stack
  691. if prefix is None:
  692. self.defaultNsStack.pop()
  693. ## class FileParser(ElementStream):
  694. ## def __init__(self):
  695. ## ElementStream.__init__(self)
  696. ## self.DocumentStartEvent = self.docStart
  697. ## self.ElementEvent = self.elem
  698. ## self.DocumentEndEvent = self.docEnd
  699. ## self.done = 0
  700. ## def docStart(self, elem):
  701. ## self.document = elem
  702. ## def elem(self, elem):
  703. ## self.document.addChild(elem)
  704. ## def docEnd(self):
  705. ## self.done = 1
  706. ## def parse(self, filename):
  707. ## with open(filename) as f:
  708. ## for l in f.readlines():
  709. ## self.parser.Parse(l)
  710. ## assert self.done == 1
  711. ## return self.document
  712. ## def parseFile(filename):
  713. ## return FileParser().parse(filename)