xmltodict.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544
#!/usr/bin/env python
"Makes working with XML feel like you are working with JSON"

# Use defusedexpat's pyexpat when it is importable; otherwise fall back to the
# expat bindings shipped with the standard library.
try:
    from defusedexpat import pyexpat as expat
except ImportError:
    from xml.parsers import expat
from xml.sax.saxutils import XMLGenerator
from xml.sax.xmlreader import AttributesImpl
# StringIO lives in different modules depending on the interpreter: try
# cStringIO, then StringIO, then io, and keep the first one that imports.
try: # pragma no cover
    from cStringIO import StringIO
except ImportError: # pragma no cover
    try:
        from StringIO import StringIO
    except ImportError:
        from io import StringIO

# Default mapping type used for parsed documents and emitted attributes: the
# builtin dict, swapped for OrderedDict on interpreters older than 3.7.
_dict = dict
import platform
if tuple(map(int, platform.python_version_tuple()[:2])) < (3, 7):
    from collections import OrderedDict as _dict

from inspect import isgenerator

# `basestring` and `unicode` are looked up by name; when either is undefined
# (NameError) the alias falls back to `str`.
try: # pragma no cover
    _basestring = basestring
except NameError: # pragma no cover
    _basestring = str
try: # pragma no cover
    _unicode = unicode
except NameError: # pragma no cover
    _unicode = str

__author__ = 'Martin Blech'
__version__ = '0.13.0'
__license__ = 'MIT'
  32. class ParsingInterrupted(Exception):
  33. pass
  34. class _DictSAXHandler(object):
  35. def __init__(self,
  36. item_depth=0,
  37. item_callback=lambda *args: True,
  38. xml_attribs=True,
  39. attr_prefix='@',
  40. cdata_key='#text',
  41. force_cdata=False,
  42. cdata_separator='',
  43. postprocessor=None,
  44. dict_constructor=_dict,
  45. strip_whitespace=True,
  46. namespace_separator=':',
  47. namespaces=None,
  48. force_list=None,
  49. comment_key='#comment'):
  50. self.path = []
  51. self.stack = []
  52. self.data = []
  53. self.item = None
  54. self.item_depth = item_depth
  55. self.xml_attribs = xml_attribs
  56. self.item_callback = item_callback
  57. self.attr_prefix = attr_prefix
  58. self.cdata_key = cdata_key
  59. self.force_cdata = force_cdata
  60. self.cdata_separator = cdata_separator
  61. self.postprocessor = postprocessor
  62. self.dict_constructor = dict_constructor
  63. self.strip_whitespace = strip_whitespace
  64. self.namespace_separator = namespace_separator
  65. self.namespaces = namespaces
  66. self.namespace_declarations = dict_constructor()
  67. self.force_list = force_list
  68. self.comment_key = comment_key
  69. def _build_name(self, full_name):
  70. if self.namespaces is None:
  71. return full_name
  72. i = full_name.rfind(self.namespace_separator)
  73. if i == -1:
  74. return full_name
  75. namespace, name = full_name[:i], full_name[i+1:]
  76. try:
  77. short_namespace = self.namespaces[namespace]
  78. except KeyError:
  79. short_namespace = namespace
  80. if not short_namespace:
  81. return name
  82. else:
  83. return self.namespace_separator.join((short_namespace, name))
  84. def _attrs_to_dict(self, attrs):
  85. if isinstance(attrs, dict):
  86. return attrs
  87. return self.dict_constructor(zip(attrs[0::2], attrs[1::2]))
  88. def startNamespaceDecl(self, prefix, uri):
  89. self.namespace_declarations[prefix or ''] = uri
  90. def startElement(self, full_name, attrs):
  91. name = self._build_name(full_name)
  92. attrs = self._attrs_to_dict(attrs)
  93. if attrs and self.namespace_declarations:
  94. attrs['xmlns'] = self.namespace_declarations
  95. self.namespace_declarations = self.dict_constructor()
  96. self.path.append((name, attrs or None))
  97. if len(self.path) > self.item_depth:
  98. self.stack.append((self.item, self.data))
  99. if self.xml_attribs:
  100. attr_entries = []
  101. for key, value in attrs.items():
  102. key = self.attr_prefix+self._build_name(key)
  103. if self.postprocessor:
  104. entry = self.postprocessor(self.path, key, value)
  105. else:
  106. entry = (key, value)
  107. if entry:
  108. attr_entries.append(entry)
  109. attrs = self.dict_constructor(attr_entries)
  110. else:
  111. attrs = None
  112. self.item = attrs or None
  113. self.data = []
  114. def endElement(self, full_name):
  115. name = self._build_name(full_name)
  116. if len(self.path) == self.item_depth:
  117. item = self.item
  118. if item is None:
  119. item = (None if not self.data
  120. else self.cdata_separator.join(self.data))
  121. should_continue = self.item_callback(self.path, item)
  122. if not should_continue:
  123. raise ParsingInterrupted()
  124. if self.stack:
  125. data = (None if not self.data
  126. else self.cdata_separator.join(self.data))
  127. item = self.item
  128. self.item, self.data = self.stack.pop()
  129. if self.strip_whitespace and data:
  130. data = data.strip() or None
  131. if data and self.force_cdata and item is None:
  132. item = self.dict_constructor()
  133. if item is not None:
  134. if data:
  135. self.push_data(item, self.cdata_key, data)
  136. self.item = self.push_data(self.item, name, item)
  137. else:
  138. self.item = self.push_data(self.item, name, data)
  139. else:
  140. self.item = None
  141. self.data = []
  142. self.path.pop()
  143. def characters(self, data):
  144. if not self.data:
  145. self.data = [data]
  146. else:
  147. self.data.append(data)
  148. def comments(self, data):
  149. if self.strip_whitespace:
  150. data = data.strip()
  151. self.item = self.push_data(self.item, self.comment_key, data)
  152. def push_data(self, item, key, data):
  153. if self.postprocessor is not None:
  154. result = self.postprocessor(self.path, key, data)
  155. if result is None:
  156. return item
  157. key, data = result
  158. if item is None:
  159. item = self.dict_constructor()
  160. try:
  161. value = item[key]
  162. if isinstance(value, list):
  163. value.append(data)
  164. else:
  165. item[key] = [value, data]
  166. except KeyError:
  167. if self._should_force_list(key, data):
  168. item[key] = [data]
  169. else:
  170. item[key] = data
  171. return item
  172. def _should_force_list(self, key, value):
  173. if not self.force_list:
  174. return False
  175. if isinstance(self.force_list, bool):
  176. return self.force_list
  177. try:
  178. return key in self.force_list
  179. except TypeError:
  180. return self.force_list(self.path[:-1], key, value)
  181. def parse(xml_input, encoding=None, expat=expat, process_namespaces=False,
  182. namespace_separator=':', disable_entities=True, process_comments=False, **kwargs):
  183. """Parse the given XML input and convert it into a dictionary.
  184. `xml_input` can either be a `string`, a file-like object, or a generator of strings.
  185. If `xml_attribs` is `True`, element attributes are put in the dictionary
  186. among regular child elements, using `@` as a prefix to avoid collisions. If
  187. set to `False`, they are just ignored.
  188. Simple example::
  189. >>> import xmltodict
  190. >>> doc = xmltodict.parse(\"\"\"
  191. ... <a prop="x">
  192. ... <b>1</b>
  193. ... <b>2</b>
  194. ... </a>
  195. ... \"\"\")
  196. >>> doc['a']['@prop']
  197. u'x'
  198. >>> doc['a']['b']
  199. [u'1', u'2']
  200. If `item_depth` is `0`, the function returns a dictionary for the root
  201. element (default behavior). Otherwise, it calls `item_callback` every time
  202. an item at the specified depth is found and returns `None` in the end
  203. (streaming mode).
  204. The callback function receives two parameters: the `path` from the document
  205. root to the item (name-attribs pairs), and the `item` (dict). If the
  206. callback's return value is false-ish, parsing will be stopped with the
  207. :class:`ParsingInterrupted` exception.
  208. Streaming example::
  209. >>> def handle(path, item):
  210. ... print('path:%s item:%s' % (path, item))
  211. ... return True
  212. ...
  213. >>> xmltodict.parse(\"\"\"
  214. ... <a prop="x">
  215. ... <b>1</b>
  216. ... <b>2</b>
  217. ... </a>\"\"\", item_depth=2, item_callback=handle)
  218. path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:1
  219. path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:2
  220. The optional argument `postprocessor` is a function that takes `path`,
  221. `key` and `value` as positional arguments and returns a new `(key, value)`
  222. pair where both `key` and `value` may have changed. Usage example::
  223. >>> def postprocessor(path, key, value):
  224. ... try:
  225. ... return key + ':int', int(value)
  226. ... except (ValueError, TypeError):
  227. ... return key, value
  228. >>> xmltodict.parse('<a><b>1</b><b>2</b><b>x</b></a>',
  229. ... postprocessor=postprocessor)
  230. {'a': {'b:int': [1, 2], 'b': 'x'}}
  231. You can pass an alternate version of `expat` (such as `defusedexpat`) by
  232. using the `expat` parameter. E.g:
  233. >>> import defusedexpat
  234. >>> xmltodict.parse('<a>hello</a>', expat=defusedexpat.pyexpat)
  235. {'a': 'hello'}
  236. You can use the force_list argument to force lists to be created even
  237. when there is only a single child of a given level of hierarchy. The
  238. force_list argument is a tuple of keys. If the key for a given level
  239. of hierarchy is in the force_list argument, that level of hierarchy
  240. will have a list as a child (even if there is only one sub-element).
  241. The index_keys operation takes precedence over this. This is applied
  242. after any user-supplied postprocessor has already run.
  243. For example, given this input:
  244. <servers>
  245. <server>
  246. <name>host1</name>
  247. <os>Linux</os>
  248. <interfaces>
  249. <interface>
  250. <name>em0</name>
  251. <ip_address>10.0.0.1</ip_address>
  252. </interface>
  253. </interfaces>
  254. </server>
  255. </servers>
  256. If called with force_list=('interface',), it will produce
  257. this dictionary:
  258. {'servers':
  259. {'server':
  260. {'name': 'host1',
  261. 'os': 'Linux'},
  262. 'interfaces':
  263. {'interface':
  264. [ {'name': 'em0', 'ip_address': '10.0.0.1' } ] } } }
  265. `force_list` can also be a callable that receives `path`, `key` and
  266. `value`. This is helpful in cases where the logic that decides whether
  267. a list should be forced is more complex.
  268. If `process_comment` is `True` then comment will be added with comment_key
  269. (default=`'#comment'`) to then tag which contains comment
  270. For example, given this input:
  271. <a>
  272. <b>
  273. <!-- b comment -->
  274. <c>
  275. <!-- c comment -->
  276. 1
  277. </c>
  278. <d>2</d>
  279. </b>
  280. </a>
  281. If called with process_comment=True, it will produce
  282. this dictionary:
  283. 'a': {
  284. 'b': {
  285. '#comment': 'b comment',
  286. 'c': {
  287. '#comment': 'c comment',
  288. '#text': '1',
  289. },
  290. 'd': '2',
  291. },
  292. }
  293. """
  294. handler = _DictSAXHandler(namespace_separator=namespace_separator,
  295. **kwargs)
  296. if isinstance(xml_input, _unicode):
  297. if not encoding:
  298. encoding = 'utf-8'
  299. xml_input = xml_input.encode(encoding)
  300. if not process_namespaces:
  301. namespace_separator = None
  302. parser = expat.ParserCreate(
  303. encoding,
  304. namespace_separator
  305. )
  306. try:
  307. parser.ordered_attributes = True
  308. except AttributeError:
  309. # Jython's expat does not support ordered_attributes
  310. pass
  311. parser.StartNamespaceDeclHandler = handler.startNamespaceDecl
  312. parser.StartElementHandler = handler.startElement
  313. parser.EndElementHandler = handler.endElement
  314. parser.CharacterDataHandler = handler.characters
  315. if process_comments:
  316. parser.CommentHandler = handler.comments
  317. parser.buffer_text = True
  318. if disable_entities:
  319. try:
  320. # Attempt to disable DTD in Jython's expat parser (Xerces-J).
  321. feature = "http://apache.org/xml/features/disallow-doctype-decl"
  322. parser._reader.setFeature(feature, True)
  323. except AttributeError:
  324. # For CPython / expat parser.
  325. # Anything not handled ends up here and entities aren't expanded.
  326. parser.DefaultHandler = lambda x: None
  327. # Expects an integer return; zero means failure -> expat.ExpatError.
  328. parser.ExternalEntityRefHandler = lambda *x: 1
  329. if hasattr(xml_input, 'read'):
  330. parser.ParseFile(xml_input)
  331. elif isgenerator(xml_input):
  332. for chunk in xml_input:
  333. parser.Parse(chunk,False)
  334. parser.Parse(b'',True)
  335. else:
  336. parser.Parse(xml_input, True)
  337. return handler.item
  338. def _process_namespace(name, namespaces, ns_sep=':', attr_prefix='@'):
  339. if not namespaces:
  340. return name
  341. try:
  342. ns, name = name.rsplit(ns_sep, 1)
  343. except ValueError:
  344. pass
  345. else:
  346. ns_res = namespaces.get(ns.strip(attr_prefix))
  347. name = '{}{}{}{}'.format(
  348. attr_prefix if ns.startswith(attr_prefix) else '',
  349. ns_res, ns_sep, name) if ns_res else name
  350. return name
  351. def _emit(key, value, content_handler,
  352. attr_prefix='@',
  353. cdata_key='#text',
  354. depth=0,
  355. preprocessor=None,
  356. pretty=False,
  357. newl='\n',
  358. indent='\t',
  359. namespace_separator=':',
  360. namespaces=None,
  361. full_document=True,
  362. expand_iter=None):
  363. key = _process_namespace(key, namespaces, namespace_separator, attr_prefix)
  364. if preprocessor is not None:
  365. result = preprocessor(key, value)
  366. if result is None:
  367. return
  368. key, value = result
  369. if (not hasattr(value, '__iter__')
  370. or isinstance(value, _basestring)
  371. or isinstance(value, dict)):
  372. value = [value]
  373. for index, v in enumerate(value):
  374. if full_document and depth == 0 and index > 0:
  375. raise ValueError('document with multiple roots')
  376. if v is None:
  377. v = _dict()
  378. elif isinstance(v, bool):
  379. if v:
  380. v = _unicode('true')
  381. else:
  382. v = _unicode('false')
  383. elif not isinstance(v, dict):
  384. if expand_iter and hasattr(v, '__iter__') and not isinstance(v, _basestring):
  385. v = _dict(((expand_iter, v),))
  386. else:
  387. v = _unicode(v)
  388. if isinstance(v, _basestring):
  389. v = _dict(((cdata_key, v),))
  390. cdata = None
  391. attrs = _dict()
  392. children = []
  393. for ik, iv in v.items():
  394. if ik == cdata_key:
  395. cdata = iv
  396. continue
  397. if ik.startswith(attr_prefix):
  398. ik = _process_namespace(ik, namespaces, namespace_separator,
  399. attr_prefix)
  400. if ik == '@xmlns' and isinstance(iv, dict):
  401. for k, v in iv.items():
  402. attr = 'xmlns{}'.format(':{}'.format(k) if k else '')
  403. attrs[attr] = _unicode(v)
  404. continue
  405. if not isinstance(iv, _unicode):
  406. iv = _unicode(iv)
  407. attrs[ik[len(attr_prefix):]] = iv
  408. continue
  409. children.append((ik, iv))
  410. if pretty:
  411. content_handler.ignorableWhitespace(depth * indent)
  412. content_handler.startElement(key, AttributesImpl(attrs))
  413. if pretty and children:
  414. content_handler.ignorableWhitespace(newl)
  415. for child_key, child_value in children:
  416. _emit(child_key, child_value, content_handler,
  417. attr_prefix, cdata_key, depth+1, preprocessor,
  418. pretty, newl, indent, namespaces=namespaces,
  419. namespace_separator=namespace_separator,
  420. expand_iter=expand_iter)
  421. if cdata is not None:
  422. content_handler.characters(cdata)
  423. if pretty and children:
  424. content_handler.ignorableWhitespace(depth * indent)
  425. content_handler.endElement(key)
  426. if pretty and depth:
  427. content_handler.ignorableWhitespace(newl)
  428. def unparse(input_dict, output=None, encoding='utf-8', full_document=True,
  429. short_empty_elements=False,
  430. **kwargs):
  431. """Emit an XML document for the given `input_dict` (reverse of `parse`).
  432. The resulting XML document is returned as a string, but if `output` (a
  433. file-like object) is specified, it is written there instead.
  434. Dictionary keys prefixed with `attr_prefix` (default=`'@'`) are interpreted
  435. as XML node attributes, whereas keys equal to `cdata_key`
  436. (default=`'#text'`) are treated as character data.
  437. The `pretty` parameter (default=`False`) enables pretty-printing. In this
  438. mode, lines are terminated with `'\n'` and indented with `'\t'`, but this
  439. can be customized with the `newl` and `indent` parameters.
  440. """
  441. if full_document and len(input_dict) != 1:
  442. raise ValueError('Document must have exactly one root.')
  443. must_return = False
  444. if output is None:
  445. output = StringIO()
  446. must_return = True
  447. if short_empty_elements:
  448. content_handler = XMLGenerator(output, encoding, True)
  449. else:
  450. content_handler = XMLGenerator(output, encoding)
  451. if full_document:
  452. content_handler.startDocument()
  453. for key, value in input_dict.items():
  454. _emit(key, value, content_handler, full_document=full_document,
  455. **kwargs)
  456. if full_document:
  457. content_handler.endDocument()
  458. if must_return:
  459. value = output.getvalue()
  460. try: # pragma no cover
  461. value = value.decode(encoding)
  462. except AttributeError: # pragma no cover
  463. pass
  464. return value
  465. if __name__ == '__main__': # pragma: no cover
  466. import sys
  467. import marshal
  468. try:
  469. stdin = sys.stdin.buffer
  470. stdout = sys.stdout.buffer
  471. except AttributeError:
  472. stdin = sys.stdin
  473. stdout = sys.stdout
  474. (item_depth,) = sys.argv[1:]
  475. item_depth = int(item_depth)
  476. def handle_item(path, item):
  477. marshal.dump((path, item), stdout)
  478. return True
  479. try:
  480. root = parse(stdin,
  481. item_depth=item_depth,
  482. item_callback=handle_item,
  483. dict_constructor=dict)
  484. if item_depth == 0:
  485. handle_item([], root)
  486. except KeyboardInterrupt:
  487. pass