domish.py 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899
  1. # -*- test-case-name: twisted.words.test.test_domish -*-
  2. # Copyright (c) Twisted Matrix Laboratories.
  3. # See LICENSE for details.
  4. """
  5. DOM-like XML processing support.
  6. This module provides support for parsing XML into DOM-like object structures
  7. and serializing such structures to an XML string representation, optimized
  8. for use in streaming XML applications.
  9. """
  10. from __future__ import absolute_import, division
  11. from zope.interface import implementer, Interface, Attribute
  12. from twisted.python.compat import (_PY3, StringType, _coercedUnicode,
  13. iteritems, itervalues, unicode)
  14. def _splitPrefix(name):
  15. """ Internal method for splitting a prefixed Element name into its
  16. respective parts """
  17. ntok = name.split(":", 1)
  18. if len(ntok) == 2:
  19. return ntok
  20. else:
  21. return (None, ntok[0])
# Global map of namespace URIs to the prefixes that always get injected
# into the serializer's prefix map. (Note that this doesn't mean they are
# always _USED_ in the output.)
G_PREFIXES = { "http://www.w3.org/XML/1998/namespace":"xml" }
  26. class _ListSerializer:
  27. """ Internal class which serializes an Element tree into a buffer """
  28. def __init__(self, prefixes=None, prefixesInScope=None):
  29. self.writelist = []
  30. self.prefixes = {}
  31. if prefixes:
  32. self.prefixes.update(prefixes)
  33. self.prefixes.update(G_PREFIXES)
  34. self.prefixStack = [G_PREFIXES.values()] + (prefixesInScope or [])
  35. self.prefixCounter = 0
  36. def getValue(self):
  37. return u"".join(self.writelist)
  38. def getPrefix(self, uri):
  39. if uri not in self.prefixes:
  40. self.prefixes[uri] = "xn%d" % (self.prefixCounter)
  41. self.prefixCounter = self.prefixCounter + 1
  42. return self.prefixes[uri]
  43. def prefixInScope(self, prefix):
  44. stack = self.prefixStack
  45. for i in range(-1, (len(self.prefixStack)+1) * -1, -1):
  46. if prefix in stack[i]:
  47. return True
  48. return False
  49. def serialize(self, elem, closeElement=1, defaultUri=''):
  50. # Optimization shortcuts
  51. write = self.writelist.append
  52. # Shortcut, check to see if elem is actually a chunk o' serialized XML
  53. if isinstance(elem, SerializedXML):
  54. write(elem)
  55. return
  56. # Shortcut, check to see if elem is actually a string (aka Cdata)
  57. if isinstance(elem, StringType):
  58. write(escapeToXml(elem))
  59. return
  60. # Further optimizations
  61. name = elem.name
  62. uri = elem.uri
  63. defaultUri, currentDefaultUri = elem.defaultUri, defaultUri
  64. for p, u in iteritems(elem.localPrefixes):
  65. self.prefixes[u] = p
  66. self.prefixStack.append(list(elem.localPrefixes.keys()))
  67. # Inherit the default namespace
  68. if defaultUri is None:
  69. defaultUri = currentDefaultUri
  70. if uri is None:
  71. uri = defaultUri
  72. prefix = None
  73. if uri != defaultUri or uri in self.prefixes:
  74. prefix = self.getPrefix(uri)
  75. inScope = self.prefixInScope(prefix)
  76. # Create the starttag
  77. if not prefix:
  78. write("<%s" % (name))
  79. else:
  80. write("<%s:%s" % (prefix, name))
  81. if not inScope:
  82. write(" xmlns:%s='%s'" % (prefix, uri))
  83. self.prefixStack[-1].append(prefix)
  84. inScope = True
  85. if defaultUri != currentDefaultUri and \
  86. (uri != defaultUri or not prefix or not inScope):
  87. write(" xmlns='%s'" % (defaultUri))
  88. for p, u in iteritems(elem.localPrefixes):
  89. write(" xmlns:%s='%s'" % (p, u))
  90. # Serialize attributes
  91. for k,v in elem.attributes.items():
  92. # If the attribute name is a tuple, it's a qualified attribute
  93. if isinstance(k, tuple):
  94. attr_uri, attr_name = k
  95. attr_prefix = self.getPrefix(attr_uri)
  96. if not self.prefixInScope(attr_prefix):
  97. write(" xmlns:%s='%s'" % (attr_prefix, attr_uri))
  98. self.prefixStack[-1].append(attr_prefix)
  99. write(" %s:%s='%s'" % (attr_prefix, attr_name,
  100. escapeToXml(v, 1)))
  101. else:
  102. write((" %s='%s'" % ( k, escapeToXml(v, 1))))
  103. # Shortcut out if this is only going to return
  104. # the element (i.e. no children)
  105. if closeElement == 0:
  106. write(">")
  107. return
  108. # Serialize children
  109. if len(elem.children) > 0:
  110. write(">")
  111. for c in elem.children:
  112. self.serialize(c, defaultUri=defaultUri)
  113. # Add closing tag
  114. if not prefix:
  115. write("</%s>" % (name))
  116. else:
  117. write("</%s:%s>" % (prefix, name))
  118. else:
  119. write("/>")
  120. self.prefixStack.pop()
  121. SerializerClass = _ListSerializer
  122. def escapeToXml(text, isattrib = 0):
  123. """ Escape text to proper XML form, per section 2.3 in the XML specification.
  124. @type text: C{str}
  125. @param text: Text to escape
  126. @type isattrib: C{bool}
  127. @param isattrib: Triggers escaping of characters necessary for use as
  128. attribute values
  129. """
  130. text = text.replace("&", "&amp;")
  131. text = text.replace("<", "&lt;")
  132. text = text.replace(">", "&gt;")
  133. if isattrib == 1:
  134. text = text.replace("'", "&apos;")
  135. text = text.replace("\"", "&quot;")
  136. return text
  137. def unescapeFromXml(text):
  138. text = text.replace("&lt;", "<")
  139. text = text.replace("&gt;", ">")
  140. text = text.replace("&apos;", "'")
  141. text = text.replace("&quot;", "\"")
  142. text = text.replace("&amp;", "&")
  143. return text
  144. def generateOnlyInterface(list, int):
  145. """ Filters items in a list by class
  146. """
  147. for n in list:
  148. if int.providedBy(n):
  149. yield n
  150. def generateElementsQNamed(list, name, uri):
  151. """ Filters Element items in a list with matching name and URI. """
  152. for n in list:
  153. if IElement.providedBy(n) and n.name == name and n.uri == uri:
  154. yield n
  155. def generateElementsNamed(list, name):
  156. """ Filters Element items in a list with matching name, regardless of URI.
  157. """
  158. for n in list:
  159. if IElement.providedBy(n) and n.name == name:
  160. yield n
  161. class SerializedXML(unicode):
  162. """ Marker class for pre-serialized XML in the DOM. """
  163. pass
  164. class Namespace:
  165. """ Convenience object for tracking namespace declarations. """
  166. def __init__(self, uri):
  167. self._uri = uri
  168. def __getattr__(self, n):
  169. return (self._uri, n)
  170. def __getitem__(self, n):
  171. return (self._uri, n)
class IElement(Interface):
    """
    Interface to XML element nodes.

    See L{Element} for a detailed example of its general use.

    Warning: this Interface is not yet complete!
    """

    uri = Attribute(""" Element's namespace URI """)
    name = Attribute(""" Element's local name """)
    defaultUri = Attribute(""" Default namespace URI of child elements """)
    attributes = Attribute(""" Dictionary of element attributes """)
    children = Attribute(""" List of child nodes """)
    parent = Attribute(""" Reference to element's parent element """)
    localPrefixes = Attribute(""" Dictionary of local prefixes """)

    def toXml(prefixes=None, closeElement=1, defaultUri='',
              prefixesInScope=None):
        """
        Serializes object to a (partial) XML document.

        @param prefixes: dictionary that maps namespace URIs to suggested
                         prefix names.
        @type prefixes: L{dict}

        @param closeElement: flag that determines whether to include the
            closing tag of the element in the serialized string. A value of
            C{0} only generates the element's start tag. A value of C{1}
            yields a complete serialization.
        @type closeElement: L{int}

        @param defaultUri: Initial default namespace URI. This is most
            useful for partial rendering, where the logical parent element
            (of which the starttag was already serialized) declares a
            default namespace that should be inherited.
        @type defaultUri: L{unicode}

        @param prefixesInScope: list of prefixes that are assumed to be
            declared by ancestors.
        @type prefixesInScope: C{list}

        @return: (partial) serialized XML
        @rtype: C{unicode}
        """

    def addElement(name, defaultUri=None, content=None):
        """
        Create an element and add as child.

        The new element is added to this element as a child, and will have
        this element as its parent.

        @param name: element name. This can be either a L{unicode} object
            that contains the local name, or a tuple of (uri, local_name)
            for a fully qualified name. In the former case, the namespace
            URI is inherited from this element.
        @type name: L{unicode} or L{tuple} of (L{unicode}, L{unicode})

        @param defaultUri: default namespace URI for child elements. If
            L{None}, this is inherited from this element.
        @type defaultUri: L{unicode}

        @param content: text contained by the new element.
        @type content: L{unicode}

        @return: the created element
        @rtype: object providing L{IElement}
        """

    def addChild(node):
        """
        Adds a node as child of this element.

        The C{node} will be added to the list of children of this element,
        and will have this element set as its parent when C{node} provides
        L{IElement}. If C{node} is a L{unicode} and the current last child
        is character data (L{unicode}), the text from C{node} is appended to
        the existing last child.

        @param node: the child node.
        @type node: L{unicode} or object implementing L{IElement}
        """

    def addContent(text):
        """
        Adds character data to this element.

        If the current last child of this element is a string, the text will
        be appended to that string. Otherwise, the text will be added as a
        new child.

        @param text: The character data to be added to this element.
        @type text: L{unicode}
        """
  245. @implementer(IElement)
  246. class Element(object):
  247. """ Represents an XML element node.
  248. An Element contains a series of attributes (name/value pairs), content
  249. (character data), and other child Element objects. When building a document
  250. with markup (such as HTML or XML), use this object as the starting point.
  251. Element objects fully support XML Namespaces. The fully qualified name of
  252. the XML Element it represents is stored in the C{uri} and C{name}
  253. attributes, where C{uri} holds the namespace URI. There is also a default
  254. namespace, for child elements. This is stored in the C{defaultUri}
  255. attribute. Note that C{''} means the empty namespace.
  256. Serialization of Elements through C{toXml()} will use these attributes
  257. for generating proper serialized XML. When both C{uri} and C{defaultUri}
  258. are not None in the Element and all of its descendents, serialization
  259. proceeds as expected:
  260. >>> from twisted.words.xish import domish
  261. >>> root = domish.Element(('myns', 'root'))
  262. >>> root.addElement('child', content='test')
  263. <twisted.words.xish.domish.Element object at 0x83002ac>
  264. >>> root.toXml()
  265. u"<root xmlns='myns'><child>test</child></root>"
  266. For partial serialization, needed for streaming XML, a special value for
  267. namespace URIs can be used: L{None}.
  268. Using L{None} as the value for C{uri} means: this element is in whatever
  269. namespace inherited by the closest logical ancestor when the complete XML
  270. document has been serialized. The serialized start tag will have a
  271. non-prefixed name, and no xmlns declaration will be generated.
  272. Similarly, L{None} for C{defaultUri} means: the default namespace for my
  273. child elements is inherited from the logical ancestors of this element,
  274. when the complete XML document has been serialized.
  275. To illustrate, an example from a Jabber stream. Assume the start tag of the
  276. root element of the stream has already been serialized, along with several
  277. complete child elements, and sent off, looking like this::
  278. <stream:stream xmlns:stream='http://etherx.jabber.org/streams'
  279. xmlns='jabber:client' to='example.com'>
  280. ...
  281. Now suppose we want to send a complete element represented by an
  282. object C{message} created like:
  283. >>> message = domish.Element((None, 'message'))
  284. >>> message['to'] = 'user@example.com'
  285. >>> message.addElement('body', content='Hi!')
  286. <twisted.words.xish.domish.Element object at 0x8276e8c>
  287. >>> message.toXml()
  288. u"<message to='user@example.com'><body>Hi!</body></message>"
  289. As, you can see, this XML snippet has no xmlns declaration. When sent
  290. off, it inherits the C{jabber:client} namespace from the root element.
  291. Note that this renders the same as using C{''} instead of L{None}:
  292. >>> presence = domish.Element(('', 'presence'))
  293. >>> presence.toXml()
  294. u"<presence/>"
  295. However, if this object has a parent defined, the difference becomes
  296. clear:
  297. >>> child = message.addElement(('http://example.com/', 'envelope'))
  298. >>> child.addChild(presence)
  299. <twisted.words.xish.domish.Element object at 0x8276fac>
  300. >>> message.toXml()
  301. u"<message to='user@example.com'><body>Hi!</body><envelope xmlns='http://example.com/'><presence xmlns=''/></envelope></message>"
  302. As, you can see, the <presence/> element is now in the empty namespace, not
  303. in the default namespace of the parent or the streams'.
  304. @type uri: C{unicode} or None
  305. @ivar uri: URI of this Element's name
  306. @type name: C{unicode}
  307. @ivar name: Name of this Element
  308. @type defaultUri: C{unicode} or None
  309. @ivar defaultUri: URI this Element exists within
  310. @type children: C{list}
  311. @ivar children: List of child Elements and content
  312. @type parent: L{Element}
  313. @ivar parent: Reference to the parent Element, if any.
  314. @type attributes: L{dict}
  315. @ivar attributes: Dictionary of attributes associated with this Element.
  316. @type localPrefixes: L{dict}
  317. @ivar localPrefixes: Dictionary of namespace declarations on this
  318. element. The key is the prefix to bind the
  319. namespace uri to.
  320. """
  321. _idCounter = 0
  322. def __init__(self, qname, defaultUri=None, attribs=None,
  323. localPrefixes=None):
  324. """
  325. @param qname: Tuple of (uri, name)
  326. @param defaultUri: The default URI of the element; defaults to the URI
  327. specified in C{qname}
  328. @param attribs: Dictionary of attributes
  329. @param localPrefixes: Dictionary of namespace declarations on this
  330. element. The key is the prefix to bind the
  331. namespace uri to.
  332. """
  333. self.localPrefixes = localPrefixes or {}
  334. self.uri, self.name = qname
  335. if defaultUri is None and \
  336. self.uri not in itervalues(self.localPrefixes):
  337. self.defaultUri = self.uri
  338. else:
  339. self.defaultUri = defaultUri
  340. self.attributes = attribs or {}
  341. self.children = []
  342. self.parent = None
  343. def __getattr__(self, key):
  344. # Check child list for first Element with a name matching the key
  345. for n in self.children:
  346. if IElement.providedBy(n) and n.name == key:
  347. return n
  348. # Tweak the behaviour so that it's more friendly about not
  349. # finding elements -- we need to document this somewhere :)
  350. if key.startswith('_'):
  351. raise AttributeError(key)
  352. else:
  353. return None
  354. def __getitem__(self, key):
  355. return self.attributes[self._dqa(key)]
  356. def __delitem__(self, key):
  357. del self.attributes[self._dqa(key)];
  358. def __setitem__(self, key, value):
  359. self.attributes[self._dqa(key)] = value
  360. def __unicode__(self):
  361. """
  362. Retrieve the first CData (content) node
  363. """
  364. for n in self.children:
  365. if isinstance(n, StringType):
  366. return n
  367. return u""
  368. def __bytes__(self):
  369. """
  370. Retrieve the first character data node as UTF-8 bytes.
  371. """
  372. return unicode(self).encode('utf-8')
  373. if _PY3:
  374. __str__ = __unicode__
  375. else:
  376. __str__ = __bytes__
  377. def _dqa(self, attr):
  378. """ Dequalify an attribute key as needed """
  379. if isinstance(attr, tuple) and not attr[0]:
  380. return attr[1]
  381. else:
  382. return attr
  383. def getAttribute(self, attribname, default = None):
  384. """ Retrieve the value of attribname, if it exists """
  385. return self.attributes.get(attribname, default)
  386. def hasAttribute(self, attrib):
  387. """ Determine if the specified attribute exists """
  388. return self._dqa(attrib) in self.attributes
  389. def compareAttribute(self, attrib, value):
  390. """ Safely compare the value of an attribute against a provided value.
  391. L{None}-safe.
  392. """
  393. return self.attributes.get(self._dqa(attrib), None) == value
  394. def swapAttributeValues(self, left, right):
  395. """ Swap the values of two attribute. """
  396. d = self.attributes
  397. l = d[left]
  398. d[left] = d[right]
  399. d[right] = l
  400. def addChild(self, node):
  401. """ Add a child to this Element. """
  402. if IElement.providedBy(node):
  403. node.parent = self
  404. self.children.append(node)
  405. return node
  406. def addContent(self, text):
  407. """ Add some text data to this Element. """
  408. text = _coercedUnicode(text)
  409. c = self.children
  410. if len(c) > 0 and isinstance(c[-1], unicode):
  411. c[-1] = c[-1] + text
  412. else:
  413. c.append(text)
  414. return c[-1]
  415. def addElement(self, name, defaultUri = None, content = None):
  416. if isinstance(name, tuple):
  417. if defaultUri is None:
  418. defaultUri = name[0]
  419. child = Element(name, defaultUri)
  420. else:
  421. if defaultUri is None:
  422. defaultUri = self.defaultUri
  423. child = Element((defaultUri, name), defaultUri)
  424. self.addChild(child)
  425. if content:
  426. child.addContent(content)
  427. return child
  428. def addRawXml(self, rawxmlstring):
  429. """ Add a pre-serialized chunk o' XML as a child of this Element. """
  430. self.children.append(SerializedXML(rawxmlstring))
  431. def addUniqueId(self):
  432. """ Add a unique (across a given Python session) id attribute to this
  433. Element.
  434. """
  435. self.attributes["id"] = "H_%d" % Element._idCounter
  436. Element._idCounter = Element._idCounter + 1
  437. def elements(self, uri=None, name=None):
  438. """
  439. Iterate across all children of this Element that are Elements.
  440. Returns a generator over the child elements. If both the C{uri} and
  441. C{name} parameters are set, the returned generator will only yield
  442. on elements matching the qualified name.
  443. @param uri: Optional element URI.
  444. @type uri: C{unicode}
  445. @param name: Optional element name.
  446. @type name: C{unicode}
  447. @return: Iterator that yields objects implementing L{IElement}.
  448. """
  449. if name is None:
  450. return generateOnlyInterface(self.children, IElement)
  451. else:
  452. return generateElementsQNamed(self.children, name, uri)
  453. def toXml(self, prefixes=None, closeElement=1, defaultUri='',
  454. prefixesInScope=None):
  455. """ Serialize this Element and all children to a string. """
  456. s = SerializerClass(prefixes=prefixes, prefixesInScope=prefixesInScope)
  457. s.serialize(self, closeElement=closeElement, defaultUri=defaultUri)
  458. return s.getValue()
  459. def firstChildElement(self):
  460. for c in self.children:
  461. if IElement.providedBy(c):
  462. return c
  463. return None
  464. class ParserError(Exception):
  465. """ Exception thrown when a parsing error occurs """
  466. pass
  467. def elementStream():
  468. """ Preferred method to construct an ElementStream
  469. Uses Expat-based stream if available, and falls back to Sux if necessary.
  470. """
  471. try:
  472. es = ExpatElementStream()
  473. return es
  474. except ImportError:
  475. if SuxElementStream is None:
  476. raise Exception("No parsers available :(")
  477. es = SuxElementStream()
  478. return es
  479. try:
  480. from twisted.web import sux
  481. except:
  482. SuxElementStream = None
  483. else:
  484. class SuxElementStream(sux.XMLParser):
  485. def __init__(self):
  486. self.connectionMade()
  487. self.DocumentStartEvent = None
  488. self.ElementEvent = None
  489. self.DocumentEndEvent = None
  490. self.currElem = None
  491. self.rootElem = None
  492. self.documentStarted = False
  493. self.defaultNsStack = []
  494. self.prefixStack = []
  495. def parse(self, buffer):
  496. try:
  497. self.dataReceived(buffer)
  498. except sux.ParseError as e:
  499. raise ParserError(str(e))
  500. def findUri(self, prefix):
  501. # Walk prefix stack backwards, looking for the uri
  502. # matching the specified prefix
  503. stack = self.prefixStack
  504. for i in range(-1, (len(self.prefixStack)+1) * -1, -1):
  505. if prefix in stack[i]:
  506. return stack[i][prefix]
  507. return None
  508. def gotTagStart(self, name, attributes):
  509. defaultUri = None
  510. localPrefixes = {}
  511. attribs = {}
  512. uri = None
  513. # Pass 1 - Identify namespace decls
  514. for k, v in list(attributes.items()):
  515. if k.startswith("xmlns"):
  516. x, p = _splitPrefix(k)
  517. if (x is None): # I.e. default declaration
  518. defaultUri = v
  519. else:
  520. localPrefixes[p] = v
  521. del attributes[k]
  522. # Push namespace decls onto prefix stack
  523. self.prefixStack.append(localPrefixes)
  524. # Determine default namespace for this element; if there
  525. # is one
  526. if defaultUri is None:
  527. if len(self.defaultNsStack) > 0:
  528. defaultUri = self.defaultNsStack[-1]
  529. else:
  530. defaultUri = ''
  531. # Fix up name
  532. prefix, name = _splitPrefix(name)
  533. if prefix is None: # This element is in the default namespace
  534. uri = defaultUri
  535. else:
  536. # Find the URI for the prefix
  537. uri = self.findUri(prefix)
  538. # Pass 2 - Fix up and escape attributes
  539. for k, v in attributes.items():
  540. p, n = _splitPrefix(k)
  541. if p is None:
  542. attribs[n] = v
  543. else:
  544. attribs[(self.findUri(p)), n] = unescapeFromXml(v)
  545. # Construct the actual Element object
  546. e = Element((uri, name), defaultUri, attribs, localPrefixes)
  547. # Save current default namespace
  548. self.defaultNsStack.append(defaultUri)
  549. # Document already started
  550. if self.documentStarted:
  551. # Starting a new packet
  552. if self.currElem is None:
  553. self.currElem = e
  554. # Adding to existing element
  555. else:
  556. self.currElem = self.currElem.addChild(e)
  557. # New document
  558. else:
  559. self.rootElem = e
  560. self.documentStarted = True
  561. self.DocumentStartEvent(e)
  562. def gotText(self, data):
  563. if self.currElem != None:
  564. if isinstance(data, bytes):
  565. data = data.decode('ascii')
  566. self.currElem.addContent(data)
  567. def gotCData(self, data):
  568. if self.currElem != None:
  569. if isinstance(data, bytes):
  570. data = data.decode('ascii')
  571. self.currElem.addContent(data)
  572. def gotComment(self, data):
  573. # Ignore comments for the moment
  574. pass
  575. entities = { "amp" : "&",
  576. "lt" : "<",
  577. "gt" : ">",
  578. "apos": "'",
  579. "quot": "\"" }
  580. def gotEntityReference(self, entityRef):
  581. # If this is an entity we know about, add it as content
  582. # to the current element
  583. if entityRef in SuxElementStream.entities:
  584. data = SuxElementStream.entities[entityRef]
  585. if isinstance(data, bytes):
  586. data = data.decode('ascii')
  587. self.currElem.addContent(data)
  588. def gotTagEnd(self, name):
  589. # Ensure the document hasn't already ended
  590. if self.rootElem is None:
  591. # XXX: Write more legible explanation
  592. raise ParserError("Element closed after end of document.")
  593. # Fix up name
  594. prefix, name = _splitPrefix(name)
  595. if prefix is None:
  596. uri = self.defaultNsStack[-1]
  597. else:
  598. uri = self.findUri(prefix)
  599. # End of document
  600. if self.currElem is None:
  601. # Ensure element name and uri matches
  602. if self.rootElem.name != name or self.rootElem.uri != uri:
  603. raise ParserError("Mismatched root elements")
  604. self.DocumentEndEvent()
  605. self.rootElem = None
  606. # Other elements
  607. else:
  608. # Ensure the tag being closed matches the name of the current
  609. # element
  610. if self.currElem.name != name or self.currElem.uri != uri:
  611. # XXX: Write more legible explanation
  612. raise ParserError("Malformed element close")
  613. # Pop prefix and default NS stack
  614. self.prefixStack.pop()
  615. self.defaultNsStack.pop()
  616. # Check for parent null parent of current elem;
  617. # that's the top of the stack
  618. if self.currElem.parent is None:
  619. self.currElem.parent = self.rootElem
  620. self.ElementEvent(self.currElem)
  621. self.currElem = None
  622. # Anything else is just some element wrapping up
  623. else:
  624. self.currElem = self.currElem.parent
  625. class ExpatElementStream:
  626. def __init__(self):
  627. import pyexpat
  628. self.DocumentStartEvent = None
  629. self.ElementEvent = None
  630. self.DocumentEndEvent = None
  631. self.error = pyexpat.error
  632. self.parser = pyexpat.ParserCreate("UTF-8", " ")
  633. self.parser.StartElementHandler = self._onStartElement
  634. self.parser.EndElementHandler = self._onEndElement
  635. self.parser.CharacterDataHandler = self._onCdata
  636. self.parser.StartNamespaceDeclHandler = self._onStartNamespace
  637. self.parser.EndNamespaceDeclHandler = self._onEndNamespace
  638. self.currElem = None
  639. self.defaultNsStack = ['']
  640. self.documentStarted = 0
  641. self.localPrefixes = {}
  642. def parse(self, buffer):
  643. try:
  644. self.parser.Parse(buffer)
  645. except self.error as e:
  646. raise ParserError(str(e))
  647. def _onStartElement(self, name, attrs):
  648. # Generate a qname tuple from the provided name. See
  649. # http://docs.python.org/library/pyexpat.html#xml.parsers.expat.ParserCreate
  650. # for an explanation of the formatting of name.
  651. qname = name.rsplit(" ", 1)
  652. if len(qname) == 1:
  653. qname = ('', name)
  654. # Process attributes
  655. newAttrs = {}
  656. toDelete = []
  657. for k, v in attrs.items():
  658. if " " in k:
  659. aqname = k.rsplit(" ", 1)
  660. newAttrs[(aqname[0], aqname[1])] = v
  661. toDelete.append(k)
  662. attrs.update(newAttrs)
  663. for k in toDelete:
  664. del attrs[k]
  665. # Construct the new element
  666. e = Element(qname, self.defaultNsStack[-1], attrs, self.localPrefixes)
  667. self.localPrefixes = {}
  668. # Document already started
  669. if self.documentStarted == 1:
  670. if self.currElem != None:
  671. self.currElem.children.append(e)
  672. e.parent = self.currElem
  673. self.currElem = e
  674. # New document
  675. else:
  676. self.documentStarted = 1
  677. self.DocumentStartEvent(e)
  678. def _onEndElement(self, _):
  679. # Check for null current elem; end of doc
  680. if self.currElem is None:
  681. self.DocumentEndEvent()
  682. # Check for parent that is None; that's
  683. # the top of the stack
  684. elif self.currElem.parent is None:
  685. self.ElementEvent(self.currElem)
  686. self.currElem = None
  687. # Anything else is just some element in the current
  688. # packet wrapping up
  689. else:
  690. self.currElem = self.currElem.parent
  691. def _onCdata(self, data):
  692. if self.currElem != None:
  693. self.currElem.addContent(data)
  694. def _onStartNamespace(self, prefix, uri):
  695. # If this is the default namespace, put
  696. # it on the stack
  697. if prefix is None:
  698. self.defaultNsStack.append(uri)
  699. else:
  700. self.localPrefixes[prefix] = uri
  701. def _onEndNamespace(self, prefix):
  702. # Remove last element on the stack
  703. if prefix is None:
  704. self.defaultNsStack.pop()
  705. ## class FileParser(ElementStream):
  706. ## def __init__(self):
  707. ## ElementStream.__init__(self)
  708. ## self.DocumentStartEvent = self.docStart
  709. ## self.ElementEvent = self.elem
  710. ## self.DocumentEndEvent = self.docEnd
  711. ## self.done = 0
  712. ## def docStart(self, elem):
  713. ## self.document = elem
  714. ## def elem(self, elem):
  715. ## self.document.addChild(elem)
  716. ## def docEnd(self):
  717. ## self.done = 1
  718. ## def parse(self, filename):
  719. ## with open(filename) as f:
  720. ## for l in f.readlines():
  721. ## self.parser.Parse(l)
  722. ## assert self.done == 1
  723. ## return self.document
  724. ## def parseFile(filename):
  725. ## return FileParser().parse(filename)