xmltodict.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488
  1. #!/usr/bin/env python
  2. "Makes working with XML feel like you are working with JSON"
  3. try:
  4. from defusedexpat import pyexpat as expat
  5. except ImportError:
  6. from xml.parsers import expat
  7. from xml.sax.saxutils import XMLGenerator
  8. from xml.sax.xmlreader import AttributesImpl
  9. try: # pragma no cover
  10. from cStringIO import StringIO
  11. except ImportError: # pragma no cover
  12. try:
  13. from StringIO import StringIO
  14. except ImportError:
  15. from io import StringIO
  16. from collections import OrderedDict
  17. try: # pragma no cover
  18. _basestring = basestring
  19. except NameError: # pragma no cover
  20. _basestring = str
  21. try: # pragma no cover
  22. _unicode = unicode
  23. except NameError: # pragma no cover
  24. _unicode = str
  25. __author__ = 'Martin Blech'
  26. __version__ = '0.12.0'
  27. __license__ = 'MIT'
  28. class ParsingInterrupted(Exception):
  29. pass
  30. class _DictSAXHandler(object):
  31. def __init__(self,
  32. item_depth=0,
  33. item_callback=lambda *args: True,
  34. xml_attribs=True,
  35. attr_prefix='@',
  36. cdata_key='#text',
  37. force_cdata=False,
  38. cdata_separator='',
  39. postprocessor=None,
  40. dict_constructor=OrderedDict,
  41. strip_whitespace=True,
  42. namespace_separator=':',
  43. namespaces=None,
  44. force_list=None):
  45. self.path = []
  46. self.stack = []
  47. self.data = []
  48. self.item = None
  49. self.item_depth = item_depth
  50. self.xml_attribs = xml_attribs
  51. self.item_callback = item_callback
  52. self.attr_prefix = attr_prefix
  53. self.cdata_key = cdata_key
  54. self.force_cdata = force_cdata
  55. self.cdata_separator = cdata_separator
  56. self.postprocessor = postprocessor
  57. self.dict_constructor = dict_constructor
  58. self.strip_whitespace = strip_whitespace
  59. self.namespace_separator = namespace_separator
  60. self.namespaces = namespaces
  61. self.namespace_declarations = OrderedDict()
  62. self.force_list = force_list
  63. def _build_name(self, full_name):
  64. if not self.namespaces:
  65. return full_name
  66. i = full_name.rfind(self.namespace_separator)
  67. if i == -1:
  68. return full_name
  69. namespace, name = full_name[:i], full_name[i+1:]
  70. short_namespace = self.namespaces.get(namespace, namespace)
  71. if not short_namespace:
  72. return name
  73. else:
  74. return self.namespace_separator.join((short_namespace, name))
  75. def _attrs_to_dict(self, attrs):
  76. if isinstance(attrs, dict):
  77. return attrs
  78. return self.dict_constructor(zip(attrs[0::2], attrs[1::2]))
  79. def startNamespaceDecl(self, prefix, uri):
  80. self.namespace_declarations[prefix or ''] = uri
  81. def startElement(self, full_name, attrs):
  82. name = self._build_name(full_name)
  83. attrs = self._attrs_to_dict(attrs)
  84. if attrs and self.namespace_declarations:
  85. attrs['xmlns'] = self.namespace_declarations
  86. self.namespace_declarations = OrderedDict()
  87. self.path.append((name, attrs or None))
  88. if len(self.path) > self.item_depth:
  89. self.stack.append((self.item, self.data))
  90. if self.xml_attribs:
  91. attr_entries = []
  92. for key, value in attrs.items():
  93. key = self.attr_prefix+self._build_name(key)
  94. if self.postprocessor:
  95. entry = self.postprocessor(self.path, key, value)
  96. else:
  97. entry = (key, value)
  98. if entry:
  99. attr_entries.append(entry)
  100. attrs = self.dict_constructor(attr_entries)
  101. else:
  102. attrs = None
  103. self.item = attrs or None
  104. self.data = []
  105. def endElement(self, full_name):
  106. name = self._build_name(full_name)
  107. if len(self.path) == self.item_depth:
  108. item = self.item
  109. if item is None:
  110. item = (None if not self.data
  111. else self.cdata_separator.join(self.data))
  112. should_continue = self.item_callback(self.path, item)
  113. if not should_continue:
  114. raise ParsingInterrupted()
  115. if len(self.stack):
  116. data = (None if not self.data
  117. else self.cdata_separator.join(self.data))
  118. item = self.item
  119. self.item, self.data = self.stack.pop()
  120. if self.strip_whitespace and data:
  121. data = data.strip() or None
  122. if data and self.force_cdata and item is None:
  123. item = self.dict_constructor()
  124. if item is not None:
  125. if data:
  126. self.push_data(item, self.cdata_key, data)
  127. self.item = self.push_data(self.item, name, item)
  128. else:
  129. self.item = self.push_data(self.item, name, data)
  130. else:
  131. self.item = None
  132. self.data = []
  133. self.path.pop()
  134. def characters(self, data):
  135. if not self.data:
  136. self.data = [data]
  137. else:
  138. self.data.append(data)
  139. def push_data(self, item, key, data):
  140. if self.postprocessor is not None:
  141. result = self.postprocessor(self.path, key, data)
  142. if result is None:
  143. return item
  144. key, data = result
  145. if item is None:
  146. item = self.dict_constructor()
  147. try:
  148. value = item[key]
  149. if isinstance(value, list):
  150. value.append(data)
  151. else:
  152. item[key] = [value, data]
  153. except KeyError:
  154. if self._should_force_list(key, data):
  155. item[key] = [data]
  156. else:
  157. item[key] = data
  158. return item
  159. def _should_force_list(self, key, value):
  160. if not self.force_list:
  161. return False
  162. if isinstance(self.force_list, bool):
  163. return self.force_list
  164. try:
  165. return key in self.force_list
  166. except TypeError:
  167. return self.force_list(self.path[:-1], key, value)
  168. def parse(xml_input, encoding=None, expat=expat, process_namespaces=False,
  169. namespace_separator=':', disable_entities=True, **kwargs):
  170. """Parse the given XML input and convert it into a dictionary.
  171. `xml_input` can either be a `string` or a file-like object.
  172. If `xml_attribs` is `True`, element attributes are put in the dictionary
  173. among regular child elements, using `@` as a prefix to avoid collisions. If
  174. set to `False`, they are just ignored.
  175. Simple example::
  176. >>> import xmltodict
  177. >>> doc = xmltodict.parse(\"\"\"
  178. ... <a prop="x">
  179. ... <b>1</b>
  180. ... <b>2</b>
  181. ... </a>
  182. ... \"\"\")
  183. >>> doc['a']['@prop']
  184. u'x'
  185. >>> doc['a']['b']
  186. [u'1', u'2']
  187. If `item_depth` is `0`, the function returns a dictionary for the root
  188. element (default behavior). Otherwise, it calls `item_callback` every time
  189. an item at the specified depth is found and returns `None` in the end
  190. (streaming mode).
  191. The callback function receives two parameters: the `path` from the document
  192. root to the item (name-attribs pairs), and the `item` (dict). If the
  193. callback's return value is false-ish, parsing will be stopped with the
  194. :class:`ParsingInterrupted` exception.
  195. Streaming example::
  196. >>> def handle(path, item):
  197. ... print('path:%s item:%s' % (path, item))
  198. ... return True
  199. ...
  200. >>> xmltodict.parse(\"\"\"
  201. ... <a prop="x">
  202. ... <b>1</b>
  203. ... <b>2</b>
  204. ... </a>\"\"\", item_depth=2, item_callback=handle)
  205. path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:1
  206. path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:2
  207. The optional argument `postprocessor` is a function that takes `path`,
  208. `key` and `value` as positional arguments and returns a new `(key, value)`
  209. pair where both `key` and `value` may have changed. Usage example::
  210. >>> def postprocessor(path, key, value):
  211. ... try:
  212. ... return key + ':int', int(value)
  213. ... except (ValueError, TypeError):
  214. ... return key, value
  215. >>> xmltodict.parse('<a><b>1</b><b>2</b><b>x</b></a>',
  216. ... postprocessor=postprocessor)
  217. OrderedDict([(u'a', OrderedDict([(u'b:int', [1, 2]), (u'b', u'x')]))])
  218. You can pass an alternate version of `expat` (such as `defusedexpat`) by
  219. using the `expat` parameter. E.g:
  220. >>> import defusedexpat
  221. >>> xmltodict.parse('<a>hello</a>', expat=defusedexpat.pyexpat)
  222. OrderedDict([(u'a', u'hello')])
  223. You can use the force_list argument to force lists to be created even
  224. when there is only a single child of a given level of hierarchy. The
  225. force_list argument is a tuple of keys. If the key for a given level
  226. of hierarchy is in the force_list argument, that level of hierarchy
  227. will have a list as a child (even if there is only one sub-element).
  228. The index_keys operation takes precendence over this. This is applied
  229. after any user-supplied postprocessor has already run.
  230. For example, given this input:
  231. <servers>
  232. <server>
  233. <name>host1</name>
  234. <os>Linux</os>
  235. <interfaces>
  236. <interface>
  237. <name>em0</name>
  238. <ip_address>10.0.0.1</ip_address>
  239. </interface>
  240. </interfaces>
  241. </server>
  242. </servers>
  243. If called with force_list=('interface',), it will produce
  244. this dictionary:
  245. {'servers':
  246. {'server':
  247. {'name': 'host1',
  248. 'os': 'Linux'},
  249. 'interfaces':
  250. {'interface':
  251. [ {'name': 'em0', 'ip_address': '10.0.0.1' } ] } } }
  252. `force_list` can also be a callable that receives `path`, `key` and
  253. `value`. This is helpful in cases where the logic that decides whether
  254. a list should be forced is more complex.
  255. """
  256. handler = _DictSAXHandler(namespace_separator=namespace_separator,
  257. **kwargs)
  258. if isinstance(xml_input, _unicode):
  259. if not encoding:
  260. encoding = 'utf-8'
  261. xml_input = xml_input.encode(encoding)
  262. if not process_namespaces:
  263. namespace_separator = None
  264. parser = expat.ParserCreate(
  265. encoding,
  266. namespace_separator
  267. )
  268. try:
  269. parser.ordered_attributes = True
  270. except AttributeError:
  271. # Jython's expat does not support ordered_attributes
  272. pass
  273. parser.StartNamespaceDeclHandler = handler.startNamespaceDecl
  274. parser.StartElementHandler = handler.startElement
  275. parser.EndElementHandler = handler.endElement
  276. parser.CharacterDataHandler = handler.characters
  277. parser.buffer_text = True
  278. if disable_entities:
  279. try:
  280. # Attempt to disable DTD in Jython's expat parser (Xerces-J).
  281. feature = "http://apache.org/xml/features/disallow-doctype-decl"
  282. parser._reader.setFeature(feature, True)
  283. except AttributeError:
  284. # For CPython / expat parser.
  285. # Anything not handled ends up here and entities aren't expanded.
  286. parser.DefaultHandler = lambda x: None
  287. # Expects an integer return; zero means failure -> expat.ExpatError.
  288. parser.ExternalEntityRefHandler = lambda *x: 1
  289. if hasattr(xml_input, 'read'):
  290. parser.ParseFile(xml_input)
  291. else:
  292. parser.Parse(xml_input, True)
  293. return handler.item
  294. def _process_namespace(name, namespaces, ns_sep=':', attr_prefix='@'):
  295. if not namespaces:
  296. return name
  297. try:
  298. ns, name = name.rsplit(ns_sep, 1)
  299. except ValueError:
  300. pass
  301. else:
  302. ns_res = namespaces.get(ns.strip(attr_prefix))
  303. name = '{}{}{}{}'.format(
  304. attr_prefix if ns.startswith(attr_prefix) else '',
  305. ns_res, ns_sep, name) if ns_res else name
  306. return name
  307. def _emit(key, value, content_handler,
  308. attr_prefix='@',
  309. cdata_key='#text',
  310. depth=0,
  311. preprocessor=None,
  312. pretty=False,
  313. newl='\n',
  314. indent='\t',
  315. namespace_separator=':',
  316. namespaces=None,
  317. full_document=True):
  318. key = _process_namespace(key, namespaces, namespace_separator, attr_prefix)
  319. if preprocessor is not None:
  320. result = preprocessor(key, value)
  321. if result is None:
  322. return
  323. key, value = result
  324. if (not hasattr(value, '__iter__')
  325. or isinstance(value, _basestring)
  326. or isinstance(value, dict)):
  327. value = [value]
  328. for index, v in enumerate(value):
  329. if full_document and depth == 0 and index > 0:
  330. raise ValueError('document with multiple roots')
  331. if v is None:
  332. v = OrderedDict()
  333. elif isinstance(v, bool):
  334. if v:
  335. v = _unicode('true')
  336. else:
  337. v = _unicode('false')
  338. elif not isinstance(v, dict):
  339. v = _unicode(v)
  340. if isinstance(v, _basestring):
  341. v = OrderedDict(((cdata_key, v),))
  342. cdata = None
  343. attrs = OrderedDict()
  344. children = []
  345. for ik, iv in v.items():
  346. if ik == cdata_key:
  347. cdata = iv
  348. continue
  349. if ik.startswith(attr_prefix):
  350. ik = _process_namespace(ik, namespaces, namespace_separator,
  351. attr_prefix)
  352. if ik == '@xmlns' and isinstance(iv, dict):
  353. for k, v in iv.items():
  354. attr = 'xmlns{}'.format(':{}'.format(k) if k else '')
  355. attrs[attr] = _unicode(v)
  356. continue
  357. if not isinstance(iv, _unicode):
  358. iv = _unicode(iv)
  359. attrs[ik[len(attr_prefix):]] = iv
  360. continue
  361. children.append((ik, iv))
  362. if pretty:
  363. content_handler.ignorableWhitespace(depth * indent)
  364. content_handler.startElement(key, AttributesImpl(attrs))
  365. if pretty and children:
  366. content_handler.ignorableWhitespace(newl)
  367. for child_key, child_value in children:
  368. _emit(child_key, child_value, content_handler,
  369. attr_prefix, cdata_key, depth+1, preprocessor,
  370. pretty, newl, indent, namespaces=namespaces,
  371. namespace_separator=namespace_separator)
  372. if cdata is not None:
  373. content_handler.characters(cdata)
  374. if pretty and children:
  375. content_handler.ignorableWhitespace(depth * indent)
  376. content_handler.endElement(key)
  377. if pretty and depth:
  378. content_handler.ignorableWhitespace(newl)
  379. def unparse(input_dict, output=None, encoding='utf-8', full_document=True,
  380. short_empty_elements=False,
  381. **kwargs):
  382. """Emit an XML document for the given `input_dict` (reverse of `parse`).
  383. The resulting XML document is returned as a string, but if `output` (a
  384. file-like object) is specified, it is written there instead.
  385. Dictionary keys prefixed with `attr_prefix` (default=`'@'`) are interpreted
  386. as XML node attributes, whereas keys equal to `cdata_key`
  387. (default=`'#text'`) are treated as character data.
  388. The `pretty` parameter (default=`False`) enables pretty-printing. In this
  389. mode, lines are terminated with `'\n'` and indented with `'\t'`, but this
  390. can be customized with the `newl` and `indent` parameters.
  391. """
  392. if full_document and len(input_dict) != 1:
  393. raise ValueError('Document must have exactly one root.')
  394. must_return = False
  395. if output is None:
  396. output = StringIO()
  397. must_return = True
  398. if short_empty_elements:
  399. content_handler = XMLGenerator(output, encoding, True)
  400. else:
  401. content_handler = XMLGenerator(output, encoding)
  402. if full_document:
  403. content_handler.startDocument()
  404. for key, value in input_dict.items():
  405. _emit(key, value, content_handler, full_document=full_document,
  406. **kwargs)
  407. if full_document:
  408. content_handler.endDocument()
  409. if must_return:
  410. value = output.getvalue()
  411. try: # pragma no cover
  412. value = value.decode(encoding)
  413. except AttributeError: # pragma no cover
  414. pass
  415. return value
  416. if __name__ == '__main__': # pragma: no cover
  417. import sys
  418. import marshal
  419. try:
  420. stdin = sys.stdin.buffer
  421. stdout = sys.stdout.buffer
  422. except AttributeError:
  423. stdin = sys.stdin
  424. stdout = sys.stdout
  425. (item_depth,) = sys.argv[1:]
  426. item_depth = int(item_depth)
  427. def handle_item(path, item):
  428. marshal.dump((path, item), stdout)
  429. return True
  430. try:
  431. root = parse(stdin,
  432. item_depth=item_depth,
  433. item_callback=handle_item,
  434. dict_constructor=dict)
  435. if item_depth == 0:
  436. handle_item([], root)
  437. except KeyboardInterrupt:
  438. pass