expatreader.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454
  1. """
  2. SAX driver for the pyexpat C module. This driver works with
  3. pyexpat.__version__ == '2.22'.
  4. """
  5. version = "0.20"
  6. from xml.sax._exceptions import *
  7. from xml.sax.handler import feature_validation, feature_namespaces
  8. from xml.sax.handler import feature_namespace_prefixes
  9. from xml.sax.handler import feature_external_ges, feature_external_pes
  10. from xml.sax.handler import feature_string_interning
  11. from xml.sax.handler import property_xml_string, property_interning_dict
  12. try:
  13. from xml.parsers import expat
  14. except ImportError:
  15. raise SAXReaderNotAvailable("expat not supported", None)
  16. else:
  17. if not hasattr(expat, "ParserCreate"):
  18. raise SAXReaderNotAvailable("expat not supported", None)
  19. from xml.sax import xmlreader, saxutils, handler
  20. AttributesImpl = xmlreader.AttributesImpl
  21. AttributesNSImpl = xmlreader.AttributesNSImpl
# Choose how ExpatLocator refers back to its parser.
#
# If the low-level _weakref module can be imported, _mkproxy is
# weakref.proxy, so the locator does not hold a strong reference to the
# parser and no parser <-> content-handler cycle is created.  If the
# import fails, _mkproxy simply returns the object it is given (a strong
# reference), i.e. we "pretend".
try:
    # Imported only as an availability probe; the name is deleted below.
    import _weakref
except ImportError:
    def _mkproxy(o):
        # Fallback: no weak references, hand back the object itself.
        return o
else:
    import weakref
    _mkproxy = weakref.proxy
    # Keep the module namespace clean: only _mkproxy survives.
    del weakref, _weakref
class _ClosedParser:
    """Placeholder stored in ExpatParser._parser after close().

    ExpatParser.close() creates an instance and copies ErrorColumnNumber
    and ErrorLineNumber from the real expat parser onto it, so position
    queries keep working after the expat object has been released.
    """
    pass
  36. # --- ExpatLocator
  37. class ExpatLocator(xmlreader.Locator):
  38. """Locator for use with the ExpatParser class.
  39. This uses a weak reference to the parser object to avoid creating
  40. a circular reference between the parser and the content handler.
  41. """
  42. def __init__(self, parser):
  43. self._ref = _mkproxy(parser)
  44. def getColumnNumber(self):
  45. parser = self._ref
  46. if parser._parser is None:
  47. return None
  48. return parser._parser.ErrorColumnNumber
  49. def getLineNumber(self):
  50. parser = self._ref
  51. if parser._parser is None:
  52. return 1
  53. return parser._parser.ErrorLineNumber
  54. def getPublicId(self):
  55. parser = self._ref
  56. if parser is None:
  57. return None
  58. return parser._source.getPublicId()
  59. def getSystemId(self):
  60. parser = self._ref
  61. if parser is None:
  62. return None
  63. return parser._source.getSystemId()
  64. # --- ExpatParser
  65. class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
  66. """SAX driver for the pyexpat C module."""
  67. def __init__(self, namespaceHandling=0, bufsize=2**16-20):
  68. xmlreader.IncrementalParser.__init__(self, bufsize)
  69. self._source = xmlreader.InputSource()
  70. self._parser = None
  71. self._namespaces = namespaceHandling
  72. self._lex_handler_prop = None
  73. self._parsing = False
  74. self._entity_stack = []
  75. self._external_ges = 0
  76. self._interning = None
  77. # XMLReader methods
  78. def parse(self, source):
  79. "Parse an XML document from a URL or an InputSource."
  80. source = saxutils.prepare_input_source(source)
  81. self._source = source
  82. try:
  83. self.reset()
  84. self._cont_handler.setDocumentLocator(ExpatLocator(self))
  85. xmlreader.IncrementalParser.parse(self, source)
  86. except:
  87. # bpo-30264: Close the source on error to not leak resources:
  88. # xml.sax.parse() doesn't give access to the underlying parser
  89. # to the caller
  90. self._close_source()
  91. raise
  92. def prepareParser(self, source):
  93. if source.getSystemId() is not None:
  94. self._parser.SetBase(source.getSystemId())
  95. # Redefined setContentHandler to allow changing handlers during parsing
  96. def setContentHandler(self, handler):
  97. xmlreader.IncrementalParser.setContentHandler(self, handler)
  98. if self._parsing:
  99. self._reset_cont_handler()
  100. def getFeature(self, name):
  101. if name == feature_namespaces:
  102. return self._namespaces
  103. elif name == feature_string_interning:
  104. return self._interning is not None
  105. elif name in (feature_validation, feature_external_pes,
  106. feature_namespace_prefixes):
  107. return 0
  108. elif name == feature_external_ges:
  109. return self._external_ges
  110. raise SAXNotRecognizedException("Feature '%s' not recognized" % name)
  111. def setFeature(self, name, state):
  112. if self._parsing:
  113. raise SAXNotSupportedException("Cannot set features while parsing")
  114. if name == feature_namespaces:
  115. self._namespaces = state
  116. elif name == feature_external_ges:
  117. self._external_ges = state
  118. elif name == feature_string_interning:
  119. if state:
  120. if self._interning is None:
  121. self._interning = {}
  122. else:
  123. self._interning = None
  124. elif name == feature_validation:
  125. if state:
  126. raise SAXNotSupportedException(
  127. "expat does not support validation")
  128. elif name == feature_external_pes:
  129. if state:
  130. raise SAXNotSupportedException(
  131. "expat does not read external parameter entities")
  132. elif name == feature_namespace_prefixes:
  133. if state:
  134. raise SAXNotSupportedException(
  135. "expat does not report namespace prefixes")
  136. else:
  137. raise SAXNotRecognizedException(
  138. "Feature '%s' not recognized" % name)
  139. def getProperty(self, name):
  140. if name == handler.property_lexical_handler:
  141. return self._lex_handler_prop
  142. elif name == property_interning_dict:
  143. return self._interning
  144. elif name == property_xml_string:
  145. if self._parser:
  146. if hasattr(self._parser, "GetInputContext"):
  147. return self._parser.GetInputContext()
  148. else:
  149. raise SAXNotRecognizedException(
  150. "This version of expat does not support getting"
  151. " the XML string")
  152. else:
  153. raise SAXNotSupportedException(
  154. "XML string cannot be returned when not parsing")
  155. raise SAXNotRecognizedException("Property '%s' not recognized" % name)
  156. def setProperty(self, name, value):
  157. if name == handler.property_lexical_handler:
  158. self._lex_handler_prop = value
  159. if self._parsing:
  160. self._reset_lex_handler_prop()
  161. elif name == property_interning_dict:
  162. self._interning = value
  163. elif name == property_xml_string:
  164. raise SAXNotSupportedException("Property '%s' cannot be set" %
  165. name)
  166. else:
  167. raise SAXNotRecognizedException("Property '%s' not recognized" %
  168. name)
  169. # IncrementalParser methods
  170. def feed(self, data, isFinal=False):
  171. if not self._parsing:
  172. self.reset()
  173. self._parsing = True
  174. self._cont_handler.startDocument()
  175. try:
  176. # The isFinal parameter is internal to the expat reader.
  177. # If it is set to true, expat will check validity of the entire
  178. # document. When feeding chunks, they are not normally final -
  179. # except when invoked from close.
  180. self._parser.Parse(data, isFinal)
  181. except expat.error as e:
  182. exc = SAXParseException(expat.ErrorString(e.code), e, self)
  183. # FIXME: when to invoke error()?
  184. self._err_handler.fatalError(exc)
  185. def flush(self):
  186. if self._parser is None:
  187. return
  188. was_enabled = self._parser.GetReparseDeferralEnabled()
  189. try:
  190. self._parser.SetReparseDeferralEnabled(False)
  191. self._parser.Parse(b"", False)
  192. except expat.error as e:
  193. exc = SAXParseException(expat.ErrorString(e.code), e, self)
  194. self._err_handler.fatalError(exc)
  195. finally:
  196. self._parser.SetReparseDeferralEnabled(was_enabled)
  197. def _close_source(self):
  198. source = self._source
  199. try:
  200. file = source.getCharacterStream()
  201. if file is not None:
  202. file.close()
  203. finally:
  204. file = source.getByteStream()
  205. if file is not None:
  206. file.close()
  207. def close(self):
  208. if (self._entity_stack or self._parser is None or
  209. isinstance(self._parser, _ClosedParser)):
  210. # If we are completing an external entity, do nothing here
  211. return
  212. try:
  213. self.feed(b"", isFinal=True)
  214. self._cont_handler.endDocument()
  215. self._parsing = False
  216. # break cycle created by expat handlers pointing to our methods
  217. self._parser = None
  218. finally:
  219. self._parsing = False
  220. if self._parser is not None:
  221. # Keep ErrorColumnNumber and ErrorLineNumber after closing.
  222. parser = _ClosedParser()
  223. parser.ErrorColumnNumber = self._parser.ErrorColumnNumber
  224. parser.ErrorLineNumber = self._parser.ErrorLineNumber
  225. self._parser = parser
  226. self._close_source()
  227. def _reset_cont_handler(self):
  228. self._parser.ProcessingInstructionHandler = \
  229. self._cont_handler.processingInstruction
  230. self._parser.CharacterDataHandler = self._cont_handler.characters
  231. def _reset_lex_handler_prop(self):
  232. lex = self._lex_handler_prop
  233. parser = self._parser
  234. if lex is None:
  235. parser.CommentHandler = None
  236. parser.StartCdataSectionHandler = None
  237. parser.EndCdataSectionHandler = None
  238. parser.StartDoctypeDeclHandler = None
  239. parser.EndDoctypeDeclHandler = None
  240. else:
  241. parser.CommentHandler = lex.comment
  242. parser.StartCdataSectionHandler = lex.startCDATA
  243. parser.EndCdataSectionHandler = lex.endCDATA
  244. parser.StartDoctypeDeclHandler = self.start_doctype_decl
  245. parser.EndDoctypeDeclHandler = lex.endDTD
  246. def reset(self):
  247. if self._namespaces:
  248. self._parser = expat.ParserCreate(self._source.getEncoding(), " ",
  249. intern=self._interning)
  250. self._parser.namespace_prefixes = 1
  251. self._parser.StartElementHandler = self.start_element_ns
  252. self._parser.EndElementHandler = self.end_element_ns
  253. else:
  254. self._parser = expat.ParserCreate(self._source.getEncoding(),
  255. intern = self._interning)
  256. self._parser.StartElementHandler = self.start_element
  257. self._parser.EndElementHandler = self.end_element
  258. self._reset_cont_handler()
  259. self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
  260. self._parser.NotationDeclHandler = self.notation_decl
  261. self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
  262. self._parser.EndNamespaceDeclHandler = self.end_namespace_decl
  263. self._decl_handler_prop = None
  264. if self._lex_handler_prop:
  265. self._reset_lex_handler_prop()
  266. # self._parser.DefaultHandler =
  267. # self._parser.DefaultHandlerExpand =
  268. # self._parser.NotStandaloneHandler =
  269. self._parser.ExternalEntityRefHandler = self.external_entity_ref
  270. try:
  271. self._parser.SkippedEntityHandler = self.skipped_entity_handler
  272. except AttributeError:
  273. # This pyexpat does not support SkippedEntity
  274. pass
  275. self._parser.SetParamEntityParsing(
  276. expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)
  277. self._parsing = False
  278. self._entity_stack = []
  279. # Locator methods
  280. def getColumnNumber(self):
  281. if self._parser is None:
  282. return None
  283. return self._parser.ErrorColumnNumber
  284. def getLineNumber(self):
  285. if self._parser is None:
  286. return 1
  287. return self._parser.ErrorLineNumber
  288. def getPublicId(self):
  289. return self._source.getPublicId()
  290. def getSystemId(self):
  291. return self._source.getSystemId()
  292. # event handlers
  293. def start_element(self, name, attrs):
  294. self._cont_handler.startElement(name, AttributesImpl(attrs))
  295. def end_element(self, name):
  296. self._cont_handler.endElement(name)
  297. def start_element_ns(self, name, attrs):
  298. pair = name.split()
  299. if len(pair) == 1:
  300. # no namespace
  301. pair = (None, name)
  302. elif len(pair) == 3:
  303. pair = pair[0], pair[1]
  304. else:
  305. # default namespace
  306. pair = tuple(pair)
  307. newattrs = {}
  308. qnames = {}
  309. for (aname, value) in attrs.items():
  310. parts = aname.split()
  311. length = len(parts)
  312. if length == 1:
  313. # no namespace
  314. qname = aname
  315. apair = (None, aname)
  316. elif length == 3:
  317. qname = "%s:%s" % (parts[2], parts[1])
  318. apair = parts[0], parts[1]
  319. else:
  320. # default namespace
  321. qname = parts[1]
  322. apair = tuple(parts)
  323. newattrs[apair] = value
  324. qnames[apair] = qname
  325. self._cont_handler.startElementNS(pair, None,
  326. AttributesNSImpl(newattrs, qnames))
  327. def end_element_ns(self, name):
  328. pair = name.split()
  329. if len(pair) == 1:
  330. pair = (None, name)
  331. elif len(pair) == 3:
  332. pair = pair[0], pair[1]
  333. else:
  334. pair = tuple(pair)
  335. self._cont_handler.endElementNS(pair, None)
  336. # this is not used (call directly to ContentHandler)
  337. def processing_instruction(self, target, data):
  338. self._cont_handler.processingInstruction(target, data)
  339. # this is not used (call directly to ContentHandler)
  340. def character_data(self, data):
  341. self._cont_handler.characters(data)
  342. def start_namespace_decl(self, prefix, uri):
  343. self._cont_handler.startPrefixMapping(prefix, uri)
  344. def end_namespace_decl(self, prefix):
  345. self._cont_handler.endPrefixMapping(prefix)
  346. def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
  347. self._lex_handler_prop.startDTD(name, pubid, sysid)
  348. def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
  349. self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)
  350. def notation_decl(self, name, base, sysid, pubid):
  351. self._dtd_handler.notationDecl(name, pubid, sysid)
  352. def external_entity_ref(self, context, base, sysid, pubid):
  353. if not self._external_ges:
  354. return 1
  355. source = self._ent_handler.resolveEntity(pubid, sysid)
  356. source = saxutils.prepare_input_source(source,
  357. self._source.getSystemId() or
  358. "")
  359. self._entity_stack.append((self._parser, self._source))
  360. self._parser = self._parser.ExternalEntityParserCreate(context)
  361. self._source = source
  362. try:
  363. xmlreader.IncrementalParser.parse(self, source)
  364. except:
  365. return 0 # FIXME: save error info here?
  366. (self._parser, self._source) = self._entity_stack[-1]
  367. del self._entity_stack[-1]
  368. return 1
  369. def skipped_entity_handler(self, name, is_pe):
  370. if is_pe:
  371. # The SAX spec requires to report skipped PEs with a '%'
  372. name = '%'+name
  373. self._cont_handler.skippedEntity(name)
  374. # ---
  375. def create_parser(*args, **kwargs):
  376. return ExpatParser(*args, **kwargs)
  377. # ---
  378. if __name__ == "__main__":
  379. import xml.sax.saxutils
  380. p = create_parser()
  381. p.setContentHandler(xml.sax.saxutils.XMLGenerator())
  382. p.setErrorHandler(xml.sax.ErrorHandler())
  383. p.parse("http://www.ibiblio.org/xml/examples/shakespeare/hamlet.xml")