#!/usr/bin/env python
"Makes working with XML feel like you are working with JSON"

# Use defusedexpat's pyexpat when it is importable; otherwise fall back to
# the expat bindings shipped with the standard library.
try:
    from defusedexpat import pyexpat as expat
except ImportError:
    from xml.parsers import expat

from xml.sax.saxutils import XMLGenerator
from xml.sax.xmlreader import AttributesImpl

# StringIO moved between Python 2 (cStringIO / StringIO) and Python 3 (io).
try:  # pragma no cover
    from cStringIO import StringIO
except ImportError:  # pragma no cover
    try:
        from StringIO import StringIO
    except ImportError:
        from io import StringIO

# Default mapping type used for parsed results: the built-in dict, replaced
# by OrderedDict on interpreters older than 3.7.
_dict = dict
import platform
if tuple(map(int, platform.python_version_tuple()[:2])) < (3, 7):
    from collections import OrderedDict as _dict

from inspect import isgenerator

# Python 2 / 3 text-type aliases.
try:  # pragma no cover
    _basestring = basestring
except NameError:  # pragma no cover
    _basestring = str
try:  # pragma no cover
    _unicode = unicode
except NameError:  # pragma no cover
    _unicode = str

__author__ = 'Martin Blech'
__version__ = '0.13.0'
__license__ = 'MIT'
class ParsingInterrupted(Exception):
    """Raised to stop parsing when an item callback returns a false value."""
class _DictSAXHandler(object):
    """Turn expat parser callbacks into nested dictionaries.

    The methods `startNamespaceDecl`, `startElement`, `endElement`,
    `characters` and `comments` are meant to be installed as the matching
    expat handlers. After the document has been consumed, `item` holds the
    resulting dictionary (or `None` in streaming mode, where items are handed
    to `item_callback` instead).
    """

    def __init__(self,
                 item_depth=0,
                 item_callback=lambda *args: True,
                 xml_attribs=True,
                 attr_prefix='@',
                 cdata_key='#text',
                 force_cdata=False,
                 cdata_separator='',
                 postprocessor=None,
                 dict_constructor=_dict,
                 strip_whitespace=True,
                 namespace_separator=':',
                 namespaces=None,
                 force_list=None,
                 comment_key='#comment'):
        # Parser state.
        self.path = []    # (name, attrs-or-None) for every open element
        self.stack = []   # saved (item, data) of the enclosing elements
        self.data = []    # text chunks collected for the current element
        self.item = None  # dict (or None) being built for the current element
        # Configuration.
        self.item_depth = item_depth
        self.xml_attribs = xml_attribs
        self.item_callback = item_callback
        self.attr_prefix = attr_prefix
        self.cdata_key = cdata_key
        self.force_cdata = force_cdata
        self.cdata_separator = cdata_separator
        self.postprocessor = postprocessor
        self.dict_constructor = dict_constructor
        self.strip_whitespace = strip_whitespace
        self.namespace_separator = namespace_separator
        self.namespaces = namespaces
        # Declarations reported since the last element that consumed them.
        self.namespace_declarations = dict_constructor()
        self.force_list = force_list
        self.comment_key = comment_key

    def _build_name(self, full_name):
        """Return `full_name` with its namespace mapped through `namespaces`.

        Names are returned unchanged when no mapping is configured or when
        they contain no separator. A namespace mapped to a false value is
        dropped; an unmapped namespace is kept verbatim.
        """
        if self.namespaces is None:
            return full_name
        sep = self.namespace_separator
        i = full_name.rfind(sep)
        if i == -1:
            return full_name
        # Skip the whole separator, not just one character.
        namespace, name = full_name[:i], full_name[i + len(sep):]
        try:
            short_namespace = self.namespaces[namespace]
        except KeyError:
            short_namespace = namespace
        if not short_namespace:
            return name
        return sep.join((short_namespace, name))

    def _attrs_to_dict(self, attrs):
        """Return attributes as a mapping.

        A dict is returned as is; otherwise `attrs` is treated as a flat
        `[name, value, name, value, ...]` sequence.
        """
        if isinstance(attrs, dict):
            return attrs
        return self.dict_constructor(zip(attrs[0::2], attrs[1::2]))

    def startNamespaceDecl(self, prefix, uri):
        """Remember a namespace declaration for the next started element."""
        self.namespace_declarations[prefix or ''] = uri

    def startElement(self, full_name, attrs):
        """Open an element: record it on the path and start a new item."""
        name = self._build_name(full_name)
        attrs = self._attrs_to_dict(attrs)
        if self.namespace_declarations:
            # Attach pending declarations to the element that declared them,
            # even when it has no ordinary attributes; otherwise they would
            # leak onto the next descendant that happens to have some.
            if not attrs:
                attrs = self.dict_constructor()
            attrs['xmlns'] = self.namespace_declarations
            self.namespace_declarations = self.dict_constructor()
        self.path.append((name, attrs or None))
        if len(self.path) > self.item_depth:
            self.stack.append((self.item, self.data))
            if self.xml_attribs:
                attr_entries = []
                for key, value in attrs.items():
                    key = self.attr_prefix + self._build_name(key)
                    if self.postprocessor:
                        entry = self.postprocessor(self.path, key, value)
                    else:
                        entry = (key, value)
                    # A false entry from the postprocessor drops the attribute.
                    if entry:
                        attr_entries.append(entry)
                attrs = self.dict_constructor(attr_entries)
            else:
                attrs = None
            self.item = attrs or None
            self.data = []

    def endElement(self, full_name):
        """Close an element and merge it into its parent item.

        At `item_depth` the finished item is passed to `item_callback`;
        `ParsingInterrupted` is raised when the callback returns a false
        value.
        """
        name = self._build_name(full_name)
        if len(self.path) == self.item_depth:
            item = self.item
            if item is None:
                item = (None if not self.data
                        else self.cdata_separator.join(self.data))
            should_continue = self.item_callback(self.path, item)
            if not should_continue:
                raise ParsingInterrupted()
        if self.stack:
            data = (None if not self.data
                    else self.cdata_separator.join(self.data))
            item = self.item
            # Restore the parent's state before storing the finished child.
            self.item, self.data = self.stack.pop()
            if self.strip_whitespace and data:
                data = data.strip() or None
            if data and self.force_cdata and item is None:
                item = self.dict_constructor()
            if item is not None:
                if data:
                    self.push_data(item, self.cdata_key, data)
                self.item = self.push_data(self.item, name, item)
            else:
                self.item = self.push_data(self.item, name, data)
        else:
            self.item = None
            self.data = []
        self.path.pop()

    def characters(self, data):
        """Collect a chunk of character data for the current element."""
        if not self.data:
            self.data = [data]
        else:
            self.data.append(data)

    def comments(self, data):
        """Store a comment on the current item under `comment_key`."""
        if self.strip_whitespace:
            data = data.strip()
        self.item = self.push_data(self.item, self.comment_key, data)

    def push_data(self, item, key, data):
        """Store `data` under `key` in `item` and return the item.

        The postprocessor, when set, may rewrite the pair or suppress it by
        returning `None`. A missing `item` is created with `dict_constructor`.
        Repeated keys are collected into a list.
        """
        if self.postprocessor is not None:
            result = self.postprocessor(self.path, key, data)
            if result is None:
                return item
            key, data = result
        if item is None:
            item = self.dict_constructor()
        try:
            value = item[key]
            if isinstance(value, list):
                value.append(data)
            else:
                item[key] = [value, data]
        except KeyError:
            if self._should_force_list(key, data):
                item[key] = [data]
            else:
                item[key] = data
        return item

    def _should_force_list(self, key, value):
        """Tell whether a first value for `key` must be wrapped in a list.

        `force_list` may be a bool, a container of keys, or a callable
        receiving `(path, key, value)`.
        """
        if not self.force_list:
            return False
        if isinstance(self.force_list, bool):
            return self.force_list
        try:
            return key in self.force_list
        except TypeError:
            # Not a container: treat it as a callable.
            return self.force_list(self.path[:-1], key, value)
def parse(xml_input, encoding=None, expat=expat, process_namespaces=False,
          namespace_separator=':', disable_entities=True, process_comments=False, **kwargs):
    """Parse the given XML input and convert it into a dictionary.

    `xml_input` can either be a `string`, a file-like object, or a generator of strings.

    `encoding` is handed to the expat parser; text input is encoded with it
    first (UTF-8 when no encoding is given).

    If `process_namespaces` is `True`, the parser is created with
    `namespace_separator` so that namespace declarations are reported;
    otherwise names are left as written in the document.

    If `disable_entities` is `True` (default), handlers are installed so that
    entities are not expanded.

    Any other keyword argument (`item_depth`, `item_callback`, `xml_attribs`,
    `attr_prefix`, `cdata_key`, `force_cdata`, `cdata_separator`,
    `postprocessor`, `dict_constructor`, `strip_whitespace`, `namespaces`,
    `force_list`, `comment_key`) is forwarded to the internal handler.

    If `xml_attribs` is `True`, element attributes are put in the dictionary
    among regular child elements, using `@` as a prefix to avoid collisions. If
    set to `False`, they are just ignored.

    Simple example::

        >>> import xmltodict
        >>> doc = xmltodict.parse(\"\"\"
        ... <a prop="x">
        ...   <b>1</b>
        ...   <b>2</b>
        ... </a>
        ... \"\"\")
        >>> doc['a']['@prop']
        u'x'
        >>> doc['a']['b']
        [u'1', u'2']

    If `item_depth` is `0`, the function returns a dictionary for the root
    element (default behavior). Otherwise, it calls `item_callback` every time
    an item at the specified depth is found and returns `None` in the end
    (streaming mode).

    The callback function receives two parameters: the `path` from the document
    root to the item (name-attribs pairs), and the `item` (dict). If the
    callback's return value is false-ish, parsing will be stopped with the
    :class:`ParsingInterrupted` exception.

    Streaming example::

        >>> def handle(path, item):
        ...     print('path:%s item:%s' % (path, item))
        ...     return True
        ...
        >>> xmltodict.parse(\"\"\"
        ... <a prop="x">
        ...   <b>1</b>
        ...   <b>2</b>
        ... </a>\"\"\", item_depth=2, item_callback=handle)
        path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:1
        path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:2

    The optional argument `postprocessor` is a function that takes `path`,
    `key` and `value` as positional arguments and returns a new `(key, value)`
    pair where both `key` and `value` may have changed. Usage example::

        >>> def postprocessor(path, key, value):
        ...     try:
        ...         return key + ':int', int(value)
        ...     except (ValueError, TypeError):
        ...         return key, value
        >>> xmltodict.parse('<a><b>1</b><b>2</b><b>x</b></a>',
        ...                 postprocessor=postprocessor)
        {'a': {'b:int': [1, 2], 'b': 'x'}}

    You can pass an alternate version of `expat` (such as `defusedexpat`) by
    using the `expat` parameter. E.g:

        >>> import defusedexpat
        >>> xmltodict.parse('<a>hello</a>', expat=defusedexpat.pyexpat)
        {'a': 'hello'}

    You can use the force_list argument to force lists to be created even
    when there is only a single child of a given level of hierarchy. The
    force_list argument can be a tuple of keys. If the key for a given level
    of hierarchy is in the force_list argument, that level of hierarchy
    will have a list as a child (even if there is only one sub-element).
    This is applied after any user-supplied postprocessor has already run.

        For example, given this input:
        <servers>
          <server>
            <name>host1</name>
            <os>Linux</os>
            <interfaces>
              <interface>
                <name>em0</name>
                <ip_address>10.0.0.1</ip_address>
              </interface>
            </interfaces>
          </server>
        </servers>

        If called with force_list=('interface',), it will produce
        this dictionary:
        {'servers':
          {'server':
            {'name': 'host1',
             'os': 'Linux',
             'interfaces':
              {'interface':
                [ {'name': 'em0', 'ip_address': '10.0.0.1' } ] } } } }

    `force_list` can also be `True` (force every key) or a callable that
    receives `path`, `key` and `value`. This is helpful in cases where the
    logic that decides whether a list should be forced is more complex.

    If `process_comments` is `True` then comments will be added with
    `comment_key` (default=`'#comment'`) to the tag which contains the comment.

        For example, given this input:
        <a>
          <b>
            <!-- b comment -->
            <c>
                <!-- c comment -->
                1
            </c>
            <d>2</d>
          </b>
        </a>

        If called with process_comments=True, it will produce
        this dictionary:
        'a': {
            'b': {
                '#comment': 'b comment',
                'c': {
                    '#comment': 'c comment',
                    '#text': '1',
                },
                'd': '2',
            },
        }
    """
    handler = _DictSAXHandler(namespace_separator=namespace_separator,
                              **kwargs)
    if isinstance(xml_input, _unicode):
        # expat is fed bytes; encode text with the requested encoding.
        if not encoding:
            encoding = 'utf-8'
        xml_input = xml_input.encode(encoding)
    if not process_namespaces:
        # A separator of None switches namespace processing off in expat.
        namespace_separator = None
    parser = expat.ParserCreate(
        encoding,
        namespace_separator
    )
    try:
        parser.ordered_attributes = True
    except AttributeError:
        # Jython's expat does not support ordered_attributes
        pass
    parser.StartNamespaceDeclHandler = handler.startNamespaceDecl
    parser.StartElementHandler = handler.startElement
    parser.EndElementHandler = handler.endElement
    parser.CharacterDataHandler = handler.characters
    if process_comments:
        parser.CommentHandler = handler.comments
    parser.buffer_text = True
    if disable_entities:
        try:
            # Attempt to disable DTD in Jython's expat parser (Xerces-J).
            feature = "http://apache.org/xml/features/disallow-doctype-decl"
            parser._reader.setFeature(feature, True)
        except AttributeError:
            # For CPython / expat parser.
            # Anything not handled ends up here and entities aren't expanded.
            parser.DefaultHandler = lambda x: None
            # Expects an integer return; zero means failure -> expat.ExpatError.
            parser.ExternalEntityRefHandler = lambda *x: 1
    if hasattr(xml_input, 'read'):
        parser.ParseFile(xml_input)
    elif isgenerator(xml_input):
        for chunk in xml_input:
            parser.Parse(chunk, False)
        # Signal the end of the document after the last chunk.
        parser.Parse(b'', True)
    else:
        parser.Parse(xml_input, True)
    return handler.item
def _process_namespace(name, namespaces, ns_sep=':', attr_prefix='@'):
    """Replace the namespace part of `name` using the `namespaces` mapping.

    `name` is split at the last `ns_sep`; the part before it (minus a leading
    `attr_prefix`, if present) is looked up in `namespaces`. When the lookup
    yields a true value the result is `<prefix><mapped><ns_sep><local>`;
    otherwise the namespace is dropped and `<prefix><local>` is returned, so
    an attribute key stays recognisable as an attribute.

    `name` is returned unchanged when `namespaces` is empty or when it cannot
    be split on `ns_sep`.
    """
    if not namespaces:
        return name
    try:
        ns, local = name.rsplit(ns_sep, 1)
    except ValueError:
        return name
    prefix = ''
    if attr_prefix and ns.startswith(attr_prefix):
        # Remove exactly one leading prefix; str.strip() would treat the
        # prefix as a set of characters and also eat into the namespace.
        prefix = attr_prefix
        ns = ns[len(attr_prefix):]
    ns_res = namespaces.get(ns)
    if ns_res:
        return '{}{}{}{}'.format(prefix, ns_res, ns_sep, local)
    return prefix + local
def _emit(key, value, content_handler,
          attr_prefix='@',
          cdata_key='#text',
          depth=0,
          preprocessor=None,
          pretty=False,
          newl='\n',
          indent='\t',
          namespace_separator=':',
          namespaces=None,
          full_document=True,
          expand_iter=None):
    """Write the element(s) described by `key` / `value` to `content_handler`.

    `value` may be a scalar, a dict (attributes, character data and children)
    or an iterable of those, which produces one element per entry. Inside a
    dict, keys starting with `attr_prefix` become attributes, `cdata_key`
    holds the character data, and every other key is emitted recursively as a
    child element.

    `preprocessor`, when given, receives `(key, value)` and returns a
    replacement pair, or `None` to skip the entry. When `expand_iter` is set,
    a non-string iterable found as a list entry is wrapped as
    `{expand_iter: entry}`.

    Raises `ValueError` if `full_document` is true and more than one root
    element would be written.
    """
    key = _process_namespace(key, namespaces, namespace_separator, attr_prefix)
    if preprocessor is not None:
        result = preprocessor(key, value)
        if result is None:
            return
        key, value = result
    if (not hasattr(value, '__iter__')
            or isinstance(value, _basestring)
            or isinstance(value, dict)):
        value = [value]
    # Accept the literal '@xmlns' as before, plus the key produced when a
    # custom attribute prefix is in use.
    xmlns_keys = ('@xmlns', attr_prefix + 'xmlns')
    for index, v in enumerate(value):
        if full_document and depth == 0 and index > 0:
            raise ValueError('document with multiple roots')
        # Normalise every entry to a mapping.
        if v is None:
            v = _dict()
        elif isinstance(v, bool):
            if v:
                v = _unicode('true')
            else:
                v = _unicode('false')
        elif not isinstance(v, dict):
            if expand_iter and hasattr(v, '__iter__') and not isinstance(v, _basestring):
                v = _dict(((expand_iter, v),))
            else:
                v = _unicode(v)
        if isinstance(v, _basestring):
            v = _dict(((cdata_key, v),))
        cdata = None
        attrs = _dict()
        children = []
        for ik, iv in v.items():
            if ik == cdata_key:
                cdata = iv
                continue
            if ik.startswith(attr_prefix):
                ik = _process_namespace(ik, namespaces, namespace_separator,
                                        attr_prefix)
                if ik in xmlns_keys and isinstance(iv, dict):
                    # A mapping of prefix -> URI becomes xmlns declarations.
                    for ns_prefix, ns_uri in iv.items():
                        attr = 'xmlns{}'.format(
                            ':{}'.format(ns_prefix) if ns_prefix else '')
                        attrs[attr] = _unicode(ns_uri)
                    continue
                if not isinstance(iv, _unicode):
                    iv = _unicode(iv)
                attrs[ik[len(attr_prefix):]] = iv
                continue
            children.append((ik, iv))
        if pretty:
            content_handler.ignorableWhitespace(depth * indent)
        content_handler.startElement(key, AttributesImpl(attrs))
        if pretty and children:
            content_handler.ignorableWhitespace(newl)
        for child_key, child_value in children:
            _emit(child_key, child_value, content_handler,
                  attr_prefix, cdata_key, depth+1, preprocessor,
                  pretty, newl, indent, namespaces=namespaces,
                  namespace_separator=namespace_separator,
                  expand_iter=expand_iter)
        if cdata is not None:
            # XMLGenerator.characters only accepts text or bytes; convert
            # other scalars (e.g. numbers) instead of failing.
            if not isinstance(cdata, (_basestring, bytes)):
                cdata = _unicode(cdata)
            content_handler.characters(cdata)
        if pretty and children:
            content_handler.ignorableWhitespace(depth * indent)
        content_handler.endElement(key)
        if pretty and depth:
            content_handler.ignorableWhitespace(newl)
def unparse(input_dict, output=None, encoding='utf-8', full_document=True,
            short_empty_elements=False,
            **kwargs):
    """Emit an XML document for the given `input_dict` (reverse of `parse`).

    The resulting XML document is returned as a string, but if `output` (a
    file-like object) is specified, it is written there instead and `None`
    is returned.

    Dictionary keys prefixed with `attr_prefix` (default=`'@'`) are interpreted
    as XML node attributes, whereas keys equal to `cdata_key`
    (default=`'#text'`) are treated as character data.

    If `full_document` is `True` (default), `input_dict` must have exactly one
    root key and the document start/end events are emitted; otherwise only
    the elements themselves are written.

    If `short_empty_elements` is `True`, the XML generator is created with
    its short-empty-elements option enabled.

    The `pretty` parameter (default=`False`) enables pretty-printing. In this
    mode, lines are terminated with `'\\n'` and indented with `'\\t'`, but this
    can be customized with the `newl` and `indent` parameters.

    Remaining keyword arguments are forwarded to the element emitter.

    Raises `ValueError` if `full_document` is true and `input_dict` does not
    contain exactly one root.
    """
    if full_document and len(input_dict) != 1:
        raise ValueError('Document must have exactly one root.')
    must_return = False
    if output is None:
        output = StringIO()
        must_return = True
    if short_empty_elements:
        content_handler = XMLGenerator(output, encoding, True)
    else:
        content_handler = XMLGenerator(output, encoding)
    if full_document:
        content_handler.startDocument()
    for key, value in input_dict.items():
        _emit(key, value, content_handler, full_document=full_document,
              **kwargs)
    if full_document:
        content_handler.endDocument()
    if must_return:
        value = output.getvalue()
        try:  # pragma no cover
            # Byte strings (Python 2 buffers) are decoded to text.
            value = value.decode(encoding)
        except AttributeError:  # pragma no cover
            pass
        return value
if __name__ == '__main__':  # pragma: no cover
    # Command-line mode: read XML from stdin and marshal every item found at
    # the depth given as the single argument to stdout.
    import sys
    import marshal

    try:
        stdin = sys.stdin.buffer
        stdout = sys.stdout.buffer
    except AttributeError:
        # Streams without a binary `buffer` attribute are used directly.
        stdin = sys.stdin
        stdout = sys.stdout

    cli_args = sys.argv[1:]
    try:
        (item_depth,) = cli_args
        item_depth = int(item_depth)
    except ValueError:
        # Wrong number of arguments or a non-integer depth.
        sys.stderr.write('usage: %s ITEM_DEPTH\n' % sys.argv[0])
        sys.exit(2)

    def handle_item(path, item):
        marshal.dump((path, item), stdout)
        return True

    try:
        root = parse(stdin,
                     item_depth=item_depth,
                     item_callback=handle_item,
                     dict_constructor=dict)
        if item_depth == 0:
            # Depth 0 returns the whole document instead of streaming it.
            handle_item([], root)
    except KeyboardInterrupt:
        pass