xmltodict.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522
  1. #!/usr/bin/env python
  2. "Makes working with XML feel like you are working with JSON"
  3. from xml.parsers import expat
  4. from xml.sax.saxutils import XMLGenerator
  5. from xml.sax.xmlreader import AttributesImpl
  6. from io import StringIO
  7. _dict = dict
  8. import platform
  9. if tuple(map(int, platform.python_version_tuple()[:2])) < (3, 7):
  10. from collections import OrderedDict as _dict
  11. from inspect import isgenerator
  12. __author__ = 'Martin Blech'
  13. __version__ = "0.14.2"
  14. __license__ = 'MIT'
  15. class ParsingInterrupted(Exception):
  16. pass
  17. class _DictSAXHandler:
  18. def __init__(self,
  19. item_depth=0,
  20. item_callback=lambda *args: True,
  21. xml_attribs=True,
  22. attr_prefix='@',
  23. cdata_key='#text',
  24. force_cdata=False,
  25. cdata_separator='',
  26. postprocessor=None,
  27. dict_constructor=_dict,
  28. strip_whitespace=True,
  29. namespace_separator=':',
  30. namespaces=None,
  31. force_list=None,
  32. comment_key='#comment'):
  33. self.path = []
  34. self.stack = []
  35. self.data = []
  36. self.item = None
  37. self.item_depth = item_depth
  38. self.xml_attribs = xml_attribs
  39. self.item_callback = item_callback
  40. self.attr_prefix = attr_prefix
  41. self.cdata_key = cdata_key
  42. self.force_cdata = force_cdata
  43. self.cdata_separator = cdata_separator
  44. self.postprocessor = postprocessor
  45. self.dict_constructor = dict_constructor
  46. self.strip_whitespace = strip_whitespace
  47. self.namespace_separator = namespace_separator
  48. self.namespaces = namespaces
  49. self.namespace_declarations = dict_constructor()
  50. self.force_list = force_list
  51. self.comment_key = comment_key
  52. def _build_name(self, full_name):
  53. if self.namespaces is None:
  54. return full_name
  55. i = full_name.rfind(self.namespace_separator)
  56. if i == -1:
  57. return full_name
  58. namespace, name = full_name[:i], full_name[i+1:]
  59. try:
  60. short_namespace = self.namespaces[namespace]
  61. except KeyError:
  62. short_namespace = namespace
  63. if not short_namespace:
  64. return name
  65. else:
  66. return self.namespace_separator.join((short_namespace, name))
  67. def _attrs_to_dict(self, attrs):
  68. if isinstance(attrs, dict):
  69. return attrs
  70. return self.dict_constructor(zip(attrs[0::2], attrs[1::2]))
  71. def startNamespaceDecl(self, prefix, uri):
  72. self.namespace_declarations[prefix or ''] = uri
  73. def startElement(self, full_name, attrs):
  74. name = self._build_name(full_name)
  75. attrs = self._attrs_to_dict(attrs)
  76. if attrs and self.namespace_declarations:
  77. attrs['xmlns'] = self.namespace_declarations
  78. self.namespace_declarations = self.dict_constructor()
  79. self.path.append((name, attrs or None))
  80. if len(self.path) >= self.item_depth:
  81. self.stack.append((self.item, self.data))
  82. if self.xml_attribs:
  83. attr_entries = []
  84. for key, value in attrs.items():
  85. key = self.attr_prefix+self._build_name(key)
  86. if self.postprocessor:
  87. entry = self.postprocessor(self.path, key, value)
  88. else:
  89. entry = (key, value)
  90. if entry:
  91. attr_entries.append(entry)
  92. attrs = self.dict_constructor(attr_entries)
  93. else:
  94. attrs = None
  95. self.item = attrs or None
  96. self.data = []
  97. def endElement(self, full_name):
  98. name = self._build_name(full_name)
  99. if len(self.path) == self.item_depth:
  100. item = self.item
  101. if item is None:
  102. item = (None if not self.data
  103. else self.cdata_separator.join(self.data))
  104. should_continue = self.item_callback(self.path, item)
  105. if not should_continue:
  106. raise ParsingInterrupted
  107. if self.stack:
  108. data = (None if not self.data
  109. else self.cdata_separator.join(self.data))
  110. item = self.item
  111. self.item, self.data = self.stack.pop()
  112. if self.strip_whitespace and data:
  113. data = data.strip() or None
  114. if data and self.force_cdata and item is None:
  115. item = self.dict_constructor()
  116. if item is not None:
  117. if data:
  118. self.push_data(item, self.cdata_key, data)
  119. self.item = self.push_data(self.item, name, item)
  120. else:
  121. self.item = self.push_data(self.item, name, data)
  122. else:
  123. self.item = None
  124. self.data = []
  125. self.path.pop()
  126. def characters(self, data):
  127. if not self.data:
  128. self.data = [data]
  129. else:
  130. self.data.append(data)
  131. def comments(self, data):
  132. if self.strip_whitespace:
  133. data = data.strip()
  134. self.item = self.push_data(self.item, self.comment_key, data)
  135. def push_data(self, item, key, data):
  136. if self.postprocessor is not None:
  137. result = self.postprocessor(self.path, key, data)
  138. if result is None:
  139. return item
  140. key, data = result
  141. if item is None:
  142. item = self.dict_constructor()
  143. try:
  144. value = item[key]
  145. if isinstance(value, list):
  146. value.append(data)
  147. else:
  148. item[key] = [value, data]
  149. except KeyError:
  150. if self._should_force_list(key, data):
  151. item[key] = [data]
  152. else:
  153. item[key] = data
  154. return item
  155. def _should_force_list(self, key, value):
  156. if not self.force_list:
  157. return False
  158. if isinstance(self.force_list, bool):
  159. return self.force_list
  160. try:
  161. return key in self.force_list
  162. except TypeError:
  163. return self.force_list(self.path[:-1], key, value)
  164. def parse(xml_input, encoding=None, expat=expat, process_namespaces=False,
  165. namespace_separator=':', disable_entities=True, process_comments=False, **kwargs):
  166. """Parse the given XML input and convert it into a dictionary.
  167. `xml_input` can either be a `string`, a file-like object, or a generator of strings.
  168. If `xml_attribs` is `True`, element attributes are put in the dictionary
  169. among regular child elements, using `@` as a prefix to avoid collisions. If
  170. set to `False`, they are just ignored.
  171. Simple example::
  172. >>> import xmltodict
  173. >>> doc = xmltodict.parse(\"\"\"
  174. ... <a prop="x">
  175. ... <b>1</b>
  176. ... <b>2</b>
  177. ... </a>
  178. ... \"\"\")
  179. >>> doc['a']['@prop']
  180. u'x'
  181. >>> doc['a']['b']
  182. [u'1', u'2']
  183. If `item_depth` is `0`, the function returns a dictionary for the root
  184. element (default behavior). Otherwise, it calls `item_callback` every time
  185. an item at the specified depth is found and returns `None` in the end
  186. (streaming mode).
  187. The callback function receives two parameters: the `path` from the document
  188. root to the item (name-attribs pairs), and the `item` (dict). If the
  189. callback's return value is false-ish, parsing will be stopped with the
  190. :class:`ParsingInterrupted` exception.
  191. Streaming example::
  192. >>> def handle(path, item):
  193. ... print('path:%s item:%s' % (path, item))
  194. ... return True
  195. ...
  196. >>> xmltodict.parse(\"\"\"
  197. ... <a prop="x">
  198. ... <b>1</b>
  199. ... <b>2</b>
  200. ... </a>\"\"\", item_depth=2, item_callback=handle)
  201. path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:1
  202. path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:2
  203. The optional argument `postprocessor` is a function that takes `path`,
  204. `key` and `value` as positional arguments and returns a new `(key, value)`
  205. pair where both `key` and `value` may have changed. Usage example::
  206. >>> def postprocessor(path, key, value):
  207. ... try:
  208. ... return key + ':int', int(value)
  209. ... except (ValueError, TypeError):
  210. ... return key, value
  211. >>> xmltodict.parse('<a><b>1</b><b>2</b><b>x</b></a>',
  212. ... postprocessor=postprocessor)
  213. {'a': {'b:int': [1, 2], 'b': 'x'}}
  214. You can pass an alternate version of `expat` (such as `defusedexpat`) by
  215. using the `expat` parameter. E.g:
  216. >>> import defusedexpat
  217. >>> xmltodict.parse('<a>hello</a>', expat=defusedexpat.pyexpat)
  218. {'a': 'hello'}
  219. You can use the force_list argument to force lists to be created even
  220. when there is only a single child of a given level of hierarchy. The
  221. force_list argument is a tuple of keys. If the key for a given level
  222. of hierarchy is in the force_list argument, that level of hierarchy
  223. will have a list as a child (even if there is only one sub-element).
  224. The index_keys operation takes precedence over this. This is applied
  225. after any user-supplied postprocessor has already run.
  226. For example, given this input:
  227. <servers>
  228. <server>
  229. <name>host1</name>
  230. <os>Linux</os>
  231. <interfaces>
  232. <interface>
  233. <name>em0</name>
  234. <ip_address>10.0.0.1</ip_address>
  235. </interface>
  236. </interfaces>
  237. </server>
  238. </servers>
  239. If called with force_list=('interface',), it will produce
  240. this dictionary:
  241. {'servers':
  242. {'server':
  243. {'name': 'host1',
  244. 'os': 'Linux'},
  245. 'interfaces':
  246. {'interface':
  247. [ {'name': 'em0', 'ip_address': '10.0.0.1' } ] } } }
  248. `force_list` can also be a callable that receives `path`, `key` and
  249. `value`. This is helpful in cases where the logic that decides whether
  250. a list should be forced is more complex.
  251. If `process_comment` is `True` then comment will be added with comment_key
  252. (default=`'#comment'`) to then tag which contains comment
  253. For example, given this input:
  254. <a>
  255. <b>
  256. <!-- b comment -->
  257. <c>
  258. <!-- c comment -->
  259. 1
  260. </c>
  261. <d>2</d>
  262. </b>
  263. </a>
  264. If called with process_comment=True, it will produce
  265. this dictionary:
  266. 'a': {
  267. 'b': {
  268. '#comment': 'b comment',
  269. 'c': {
  270. '#comment': 'c comment',
  271. '#text': '1',
  272. },
  273. 'd': '2',
  274. },
  275. }
  276. """
  277. handler = _DictSAXHandler(namespace_separator=namespace_separator,
  278. **kwargs)
  279. if isinstance(xml_input, str):
  280. encoding = encoding or 'utf-8'
  281. xml_input = xml_input.encode(encoding)
  282. if not process_namespaces:
  283. namespace_separator = None
  284. parser = expat.ParserCreate(
  285. encoding,
  286. namespace_separator
  287. )
  288. try:
  289. parser.ordered_attributes = True
  290. except AttributeError:
  291. # Jython's expat does not support ordered_attributes
  292. pass
  293. parser.StartNamespaceDeclHandler = handler.startNamespaceDecl
  294. parser.StartElementHandler = handler.startElement
  295. parser.EndElementHandler = handler.endElement
  296. parser.CharacterDataHandler = handler.characters
  297. if process_comments:
  298. parser.CommentHandler = handler.comments
  299. parser.buffer_text = True
  300. if disable_entities:
  301. try:
  302. # Attempt to disable DTD in Jython's expat parser (Xerces-J).
  303. feature = "http://apache.org/xml/features/disallow-doctype-decl"
  304. parser._reader.setFeature(feature, True)
  305. except AttributeError:
  306. # For CPython / expat parser.
  307. # Anything not handled ends up here and entities aren't expanded.
  308. parser.DefaultHandler = lambda x: None
  309. # Expects an integer return; zero means failure -> expat.ExpatError.
  310. parser.ExternalEntityRefHandler = lambda *x: 1
  311. if hasattr(xml_input, 'read'):
  312. parser.ParseFile(xml_input)
  313. elif isgenerator(xml_input):
  314. for chunk in xml_input:
  315. parser.Parse(chunk, False)
  316. parser.Parse(b'', True)
  317. else:
  318. parser.Parse(xml_input, True)
  319. return handler.item
  320. def _process_namespace(name, namespaces, ns_sep=':', attr_prefix='@'):
  321. if not namespaces:
  322. return name
  323. try:
  324. ns, name = name.rsplit(ns_sep, 1)
  325. except ValueError:
  326. pass
  327. else:
  328. ns_res = namespaces.get(ns.strip(attr_prefix))
  329. name = '{}{}{}{}'.format(
  330. attr_prefix if ns.startswith(attr_prefix) else '',
  331. ns_res, ns_sep, name) if ns_res else name
  332. return name
  333. def _emit(key, value, content_handler,
  334. attr_prefix='@',
  335. cdata_key='#text',
  336. depth=0,
  337. preprocessor=None,
  338. pretty=False,
  339. newl='\n',
  340. indent='\t',
  341. namespace_separator=':',
  342. namespaces=None,
  343. full_document=True,
  344. expand_iter=None):
  345. key = _process_namespace(key, namespaces, namespace_separator, attr_prefix)
  346. if preprocessor is not None:
  347. result = preprocessor(key, value)
  348. if result is None:
  349. return
  350. key, value = result
  351. if not hasattr(value, '__iter__') or isinstance(value, (str, dict)):
  352. value = [value]
  353. for index, v in enumerate(value):
  354. if full_document and depth == 0 and index > 0:
  355. raise ValueError('document with multiple roots')
  356. if v is None:
  357. v = _dict()
  358. elif isinstance(v, bool):
  359. v = 'true' if v else 'false'
  360. elif not isinstance(v, (dict, str)):
  361. if expand_iter and hasattr(v, '__iter__'):
  362. v = _dict(((expand_iter, v),))
  363. else:
  364. v = str(v)
  365. if isinstance(v, str):
  366. v = _dict(((cdata_key, v),))
  367. cdata = None
  368. attrs = _dict()
  369. children = []
  370. for ik, iv in v.items():
  371. if ik == cdata_key:
  372. cdata = iv
  373. continue
  374. if ik.startswith(attr_prefix):
  375. ik = _process_namespace(ik, namespaces, namespace_separator,
  376. attr_prefix)
  377. if ik == '@xmlns' and isinstance(iv, dict):
  378. for k, v in iv.items():
  379. attr = 'xmlns{}'.format(f':{k}' if k else '')
  380. attrs[attr] = str(v)
  381. continue
  382. if not isinstance(iv, str):
  383. iv = str(iv)
  384. attrs[ik[len(attr_prefix):]] = iv
  385. continue
  386. children.append((ik, iv))
  387. if isinstance(indent, int):
  388. indent = ' ' * indent
  389. if pretty:
  390. content_handler.ignorableWhitespace(depth * indent)
  391. content_handler.startElement(key, AttributesImpl(attrs))
  392. if pretty and children:
  393. content_handler.ignorableWhitespace(newl)
  394. for child_key, child_value in children:
  395. _emit(child_key, child_value, content_handler,
  396. attr_prefix, cdata_key, depth+1, preprocessor,
  397. pretty, newl, indent, namespaces=namespaces,
  398. namespace_separator=namespace_separator,
  399. expand_iter=expand_iter)
  400. if cdata is not None:
  401. content_handler.characters(cdata)
  402. if pretty and children:
  403. content_handler.ignorableWhitespace(depth * indent)
  404. content_handler.endElement(key)
  405. if pretty and depth:
  406. content_handler.ignorableWhitespace(newl)
  407. def unparse(input_dict, output=None, encoding='utf-8', full_document=True,
  408. short_empty_elements=False,
  409. **kwargs):
  410. """Emit an XML document for the given `input_dict` (reverse of `parse`).
  411. The resulting XML document is returned as a string, but if `output` (a
  412. file-like object) is specified, it is written there instead.
  413. Dictionary keys prefixed with `attr_prefix` (default=`'@'`) are interpreted
  414. as XML node attributes, whereas keys equal to `cdata_key`
  415. (default=`'#text'`) are treated as character data.
  416. The `pretty` parameter (default=`False`) enables pretty-printing. In this
  417. mode, lines are terminated with `'\n'` and indented with `'\t'`, but this
  418. can be customized with the `newl` and `indent` parameters.
  419. """
  420. if full_document and len(input_dict) != 1:
  421. raise ValueError('Document must have exactly one root.')
  422. must_return = False
  423. if output is None:
  424. output = StringIO()
  425. must_return = True
  426. if short_empty_elements:
  427. content_handler = XMLGenerator(output, encoding, True)
  428. else:
  429. content_handler = XMLGenerator(output, encoding)
  430. if full_document:
  431. content_handler.startDocument()
  432. for key, value in input_dict.items():
  433. _emit(key, value, content_handler, full_document=full_document,
  434. **kwargs)
  435. if full_document:
  436. content_handler.endDocument()
  437. if must_return:
  438. value = output.getvalue()
  439. try: # pragma no cover
  440. value = value.decode(encoding)
  441. except AttributeError: # pragma no cover
  442. pass
  443. return value
  444. if __name__ == '__main__': # pragma: no cover
  445. import sys
  446. import marshal
  447. try:
  448. stdin = sys.stdin.buffer
  449. stdout = sys.stdout.buffer
  450. except AttributeError:
  451. stdin = sys.stdin
  452. stdout = sys.stdout
  453. (item_depth,) = sys.argv[1:]
  454. item_depth = int(item_depth)
  455. def handle_item(path, item):
  456. marshal.dump((path, item), stdout)
  457. return True
  458. try:
  459. root = parse(stdin,
  460. item_depth=item_depth,
  461. item_callback=handle_item,
  462. dict_constructor=dict)
  463. if item_depth == 0:
  464. handle_item([], root)
  465. except KeyboardInterrupt:
  466. pass