sux.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644
  1. # -*- test-case-name: twisted.web.test.test_xml -*-
  2. #
  3. # Copyright (c) Twisted Matrix Laboratories.
  4. # See LICENSE for details.
  5. """
*S*mall, *U*ncomplicated *X*ML.

This is a very simple implementation of XML/HTML as a network
protocol.  It is not at all clever.  Its main features are that it
does not:

  - support namespaces
  - mung mnemonic entity references
  - validate
  - perform *any* external actions (such as fetching URLs or writing files)
    under *any* circumstances

It does, however, have lots and lots of horrible hacks for supporting broken
HTML (as an option; they're not on by default).
  17. """
import codecs

from twisted.internet.protocol import Protocol
from twisted.python.reflect import prefixedMethodNames
  20. # Elements of the three-tuples in the state table.
  21. BEGIN_HANDLER = 0
  22. DO_HANDLER = 1
  23. END_HANDLER = 2
  24. identChars = ".-_:"
  25. lenientIdentChars = identChars + ";+#/%~"
  26. def nop(*args, **kw):
  27. "Do nothing."
  28. def unionlist(*args):
  29. l = []
  30. for x in args:
  31. l.extend(x)
  32. d = {x: 1 for x in l}
  33. return d.keys()
  34. def zipfndict(*args, **kw):
  35. default = kw.get("default", nop)
  36. d = {}
  37. for key in unionlist(*(fndict.keys() for fndict in args)):
  38. d[key] = tuple(x.get(key, default) for x in args)
  39. return d
  40. def prefixedMethodClassDict(clazz, prefix):
  41. return {
  42. name: getattr(clazz, prefix + name)
  43. for name in prefixedMethodNames(clazz, prefix)
  44. }
  45. def prefixedMethodObjDict(obj, prefix):
  46. return {
  47. name: getattr(obj, prefix + name)
  48. for name in prefixedMethodNames(obj.__class__, prefix)
  49. }
  50. class ParseError(Exception):
  51. def __init__(self, filename, line, col, message):
  52. self.filename = filename
  53. self.line = line
  54. self.col = col
  55. self.message = message
  56. def __str__(self) -> str:
  57. return f"{self.filename}:{self.line}:{self.col}: {self.message}"
  58. class XMLParser(Protocol):
  59. state = None
  60. encodings = None
  61. filename = "<xml />"
  62. beExtremelyLenient = 0
  63. _prepend = None
  64. # _leadingBodyData will sometimes be set before switching to the
  65. # 'bodydata' state, when we "accidentally" read a byte of bodydata
  66. # in a different state.
  67. _leadingBodyData = None
  68. def connectionMade(self):
  69. self.lineno = 1
  70. self.colno = 0
  71. self.encodings = []
  72. def saveMark(self):
  73. """Get the line number and column of the last character parsed"""
  74. # This gets replaced during dataReceived, restored afterwards
  75. return (self.lineno, self.colno)
  76. def _parseError(self, message):
  77. raise ParseError(*((self.filename,) + self.saveMark() + (message,)))
  78. def _buildStateTable(self):
  79. """Return a dictionary of begin, do, end state function tuples"""
  80. # _buildStateTable leaves something to be desired but it does what it
  81. # does.. probably slowly, so I'm doing some evil caching so it doesn't
  82. # get called more than once per class.
  83. stateTable = getattr(self.__class__, "__stateTable", None)
  84. if stateTable is None:
  85. stateTable = self.__class__.__stateTable = zipfndict(
  86. *(
  87. prefixedMethodObjDict(self, prefix)
  88. for prefix in ("begin_", "do_", "end_")
  89. )
  90. )
  91. return stateTable
  92. def _decode(self, data):
  93. if "UTF-16" in self.encodings or "UCS-2" in self.encodings:
  94. assert not len(data) & 1, "UTF-16 must come in pairs for now"
  95. if self._prepend:
  96. data = self._prepend + data
  97. for encoding in self.encodings:
  98. data = str(data, encoding)
  99. return data
  100. def maybeBodyData(self):
  101. if self.endtag:
  102. return "bodydata"
  103. # Get ready for fun! We're going to allow
  104. # <script>if (foo < bar)</script> to work!
  105. # We do this by making everything between <script> and
  106. # </script> a Text
  107. # BUT <script src="foo"> will be special-cased to do regular,
  108. # lenient behavior, because those may not have </script>
  109. # -radix
  110. if self.tagName == "script" and "src" not in self.tagAttributes:
  111. # we do this ourselves rather than having begin_waitforendscript
  112. # because that can get called multiple times and we don't want
  113. # bodydata to get reset other than the first time.
  114. self.begin_bodydata(None)
  115. return "waitforendscript"
  116. return "bodydata"
  117. def dataReceived(self, data):
  118. stateTable = self._buildStateTable()
  119. if not self.state:
  120. # all UTF-16 starts with this string
  121. if data.startswith((b"\xff\xfe", b"\xfe\xff")):
  122. self._prepend = data[0:2]
  123. self.encodings.append("UTF-16")
  124. data = data[2:]
  125. self.state = "begin"
  126. if self.encodings:
  127. data = self._decode(data)
  128. else:
  129. data = data.decode("utf-8")
  130. # bring state, lineno, colno into local scope
  131. lineno, colno = self.lineno, self.colno
  132. curState = self.state
  133. # replace saveMark with a nested scope function
  134. _saveMark = self.saveMark
  135. def saveMark():
  136. return (lineno, colno)
  137. self.saveMark = saveMark
  138. # fetch functions from the stateTable
  139. beginFn, doFn, endFn = stateTable[curState]
  140. try:
  141. for byte in data:
  142. # do newline stuff
  143. if byte == "\n":
  144. lineno += 1
  145. colno = 0
  146. else:
  147. colno += 1
  148. newState = doFn(byte)
  149. if newState is not None and newState != curState:
  150. # this is the endFn from the previous state
  151. endFn()
  152. curState = newState
  153. beginFn, doFn, endFn = stateTable[curState]
  154. beginFn(byte)
  155. finally:
  156. self.saveMark = _saveMark
  157. self.lineno, self.colno = lineno, colno
  158. # state doesn't make sense if there's an exception..
  159. self.state = curState
  160. def connectionLost(self, reason):
  161. """
  162. End the last state we were in.
  163. """
  164. stateTable = self._buildStateTable()
  165. stateTable[self.state][END_HANDLER]()
  166. # state methods
  167. def do_begin(self, byte):
  168. if byte.isspace():
  169. return
  170. if byte != "<":
  171. if self.beExtremelyLenient:
  172. self._leadingBodyData = byte
  173. return "bodydata"
  174. self._parseError(f"First char of document [{byte!r}] wasn't <")
  175. return "tagstart"
  176. def begin_comment(self, byte):
  177. self.commentbuf = ""
  178. def do_comment(self, byte):
  179. self.commentbuf += byte
  180. if self.commentbuf.endswith("-->"):
  181. self.gotComment(self.commentbuf[:-3])
  182. return "bodydata"
  183. def begin_tagstart(self, byte):
  184. self.tagName = "" # name of the tag
  185. self.tagAttributes = {} # attributes of the tag
  186. self.termtag = 0 # is the tag self-terminating
  187. self.endtag = 0
  188. def do_tagstart(self, byte):
  189. if byte.isalnum() or byte in identChars:
  190. self.tagName += byte
  191. if self.tagName == "!--":
  192. return "comment"
  193. elif byte.isspace():
  194. if self.tagName:
  195. if self.endtag:
  196. # properly strict thing to do here is probably to only
  197. # accept whitespace
  198. return "waitforgt"
  199. return "attrs"
  200. else:
  201. self._parseError("Whitespace before tag-name")
  202. elif byte == ">":
  203. if self.endtag:
  204. self.gotTagEnd(self.tagName)
  205. return "bodydata"
  206. else:
  207. self.gotTagStart(self.tagName, {})
  208. return (
  209. (not self.beExtremelyLenient) and "bodydata" or self.maybeBodyData()
  210. )
  211. elif byte == "/":
  212. if self.tagName:
  213. return "afterslash"
  214. else:
  215. self.endtag = 1
  216. elif byte in "!?":
  217. if self.tagName:
  218. if not self.beExtremelyLenient:
  219. self._parseError("Invalid character in tag-name")
  220. else:
  221. self.tagName += byte
  222. self.termtag = 1
  223. elif byte == "[":
  224. if self.tagName == "!":
  225. return "expectcdata"
  226. else:
  227. self._parseError("Invalid '[' in tag-name")
  228. else:
  229. if self.beExtremelyLenient:
  230. self.bodydata = "<"
  231. return "unentity"
  232. self._parseError("Invalid tag character: %r" % byte)
  233. def begin_unentity(self, byte):
  234. self.bodydata += byte
  235. def do_unentity(self, byte):
  236. self.bodydata += byte
  237. return "bodydata"
  238. def end_unentity(self):
  239. self.gotText(self.bodydata)
  240. def begin_expectcdata(self, byte):
  241. self.cdatabuf = byte
  242. def do_expectcdata(self, byte):
  243. self.cdatabuf += byte
  244. cdb = self.cdatabuf
  245. cd = "[CDATA["
  246. if len(cd) > len(cdb):
  247. if cd.startswith(cdb):
  248. return
  249. elif self.beExtremelyLenient:
  250. ## WHAT THE CRAP!? MSWord9 generates HTML that includes these
  251. ## bizarre <![if !foo]> <![endif]> chunks, so I've gotta ignore
  252. ## 'em as best I can. this should really be a separate parse
  253. ## state but I don't even have any idea what these _are_.
  254. return "waitforgt"
  255. else:
  256. self._parseError("Mal-formed CDATA header")
  257. if cd == cdb:
  258. self.cdatabuf = ""
  259. return "cdata"
  260. self._parseError("Mal-formed CDATA header")
  261. def do_cdata(self, byte):
  262. self.cdatabuf += byte
  263. if self.cdatabuf.endswith("]]>"):
  264. self.cdatabuf = self.cdatabuf[:-3]
  265. return "bodydata"
  266. def end_cdata(self):
  267. self.gotCData(self.cdatabuf)
  268. self.cdatabuf = ""
  269. def do_attrs(self, byte):
  270. if byte.isalnum() or byte in identChars:
  271. # XXX FIXME really handle !DOCTYPE at some point
  272. if self.tagName == "!DOCTYPE":
  273. return "doctype"
  274. if self.tagName[0] in "!?":
  275. return "waitforgt"
  276. return "attrname"
  277. elif byte.isspace():
  278. return
  279. elif byte == ">":
  280. self.gotTagStart(self.tagName, self.tagAttributes)
  281. return (not self.beExtremelyLenient) and "bodydata" or self.maybeBodyData()
  282. elif byte == "/":
  283. return "afterslash"
  284. elif self.beExtremelyLenient:
  285. # discard and move on? Only case I've seen of this so far was:
  286. # <foo bar="baz"">
  287. return
  288. self._parseError("Unexpected character: %r" % byte)
  289. def begin_doctype(self, byte):
  290. self.doctype = byte
  291. def do_doctype(self, byte):
  292. if byte == ">":
  293. return "bodydata"
  294. self.doctype += byte
  295. def end_doctype(self):
  296. self.gotDoctype(self.doctype)
  297. self.doctype = None
  298. def do_waitforgt(self, byte):
  299. if byte == ">":
  300. if self.endtag or not self.beExtremelyLenient:
  301. return "bodydata"
  302. return self.maybeBodyData()
  303. def begin_attrname(self, byte):
  304. self.attrname = byte
  305. self._attrname_termtag = 0
  306. def do_attrname(self, byte):
  307. if byte.isalnum() or byte in identChars:
  308. self.attrname += byte
  309. return
  310. elif byte == "=":
  311. return "beforeattrval"
  312. elif byte.isspace():
  313. return "beforeeq"
  314. elif self.beExtremelyLenient:
  315. if byte in "\"'":
  316. return "attrval"
  317. if byte in lenientIdentChars or byte.isalnum():
  318. self.attrname += byte
  319. return
  320. if byte == "/":
  321. self._attrname_termtag = 1
  322. return
  323. if byte == ">":
  324. self.attrval = "True"
  325. self.tagAttributes[self.attrname] = self.attrval
  326. self.gotTagStart(self.tagName, self.tagAttributes)
  327. if self._attrname_termtag:
  328. self.gotTagEnd(self.tagName)
  329. return "bodydata"
  330. return self.maybeBodyData()
  331. # something is really broken. let's leave this attribute where it
  332. # is and move on to the next thing
  333. return
  334. self._parseError(f"Invalid attribute name: {self.attrname!r} {byte!r}")
  335. def do_beforeattrval(self, byte):
  336. if byte in "\"'":
  337. return "attrval"
  338. elif byte.isspace():
  339. return
  340. elif self.beExtremelyLenient:
  341. if byte in lenientIdentChars or byte.isalnum():
  342. return "messyattr"
  343. if byte == ">":
  344. self.attrval = "True"
  345. self.tagAttributes[self.attrname] = self.attrval
  346. self.gotTagStart(self.tagName, self.tagAttributes)
  347. return self.maybeBodyData()
  348. if byte == "\\":
  349. # I saw this in actual HTML once:
  350. # <font size=\"3\"><sup>SM</sup></font>
  351. return
  352. self._parseError(
  353. "Invalid initial attribute value: %r; Attribute values must be quoted."
  354. % byte
  355. )
  356. attrname = ""
  357. attrval = ""
  358. def begin_beforeeq(self, byte):
  359. self._beforeeq_termtag = 0
  360. def do_beforeeq(self, byte):
  361. if byte == "=":
  362. return "beforeattrval"
  363. elif byte.isspace():
  364. return
  365. elif self.beExtremelyLenient:
  366. if byte.isalnum() or byte in identChars:
  367. self.attrval = "True"
  368. self.tagAttributes[self.attrname] = self.attrval
  369. return "attrname"
  370. elif byte == ">":
  371. self.attrval = "True"
  372. self.tagAttributes[self.attrname] = self.attrval
  373. self.gotTagStart(self.tagName, self.tagAttributes)
  374. if self._beforeeq_termtag:
  375. self.gotTagEnd(self.tagName)
  376. return "bodydata"
  377. return self.maybeBodyData()
  378. elif byte == "/":
  379. self._beforeeq_termtag = 1
  380. return
  381. self._parseError("Invalid attribute")
  382. def begin_attrval(self, byte):
  383. self.quotetype = byte
  384. self.attrval = ""
  385. def do_attrval(self, byte):
  386. if byte == self.quotetype:
  387. return "attrs"
  388. self.attrval += byte
  389. def end_attrval(self):
  390. self.tagAttributes[self.attrname] = self.attrval
  391. self.attrname = self.attrval = ""
  392. def begin_messyattr(self, byte):
  393. self.attrval = byte
  394. def do_messyattr(self, byte):
  395. if byte.isspace():
  396. return "attrs"
  397. elif byte == ">":
  398. endTag = 0
  399. if self.attrval.endswith("/"):
  400. endTag = 1
  401. self.attrval = self.attrval[:-1]
  402. self.tagAttributes[self.attrname] = self.attrval
  403. self.gotTagStart(self.tagName, self.tagAttributes)
  404. if endTag:
  405. self.gotTagEnd(self.tagName)
  406. return "bodydata"
  407. return self.maybeBodyData()
  408. else:
  409. self.attrval += byte
  410. def end_messyattr(self):
  411. if self.attrval:
  412. self.tagAttributes[self.attrname] = self.attrval
  413. def begin_afterslash(self, byte):
  414. self._after_slash_closed = 0
  415. def do_afterslash(self, byte):
  416. # this state is only after a self-terminating slash, e.g. <foo/>
  417. if self._after_slash_closed:
  418. self._parseError("Mal-formed") # XXX When does this happen??
  419. if byte != ">":
  420. if self.beExtremelyLenient:
  421. return
  422. else:
  423. self._parseError("No data allowed after '/'")
  424. self._after_slash_closed = 1
  425. self.gotTagStart(self.tagName, self.tagAttributes)
  426. self.gotTagEnd(self.tagName)
  427. # don't need maybeBodyData here because there better not be
  428. # any javascript code after a <script/>... we'll see :(
  429. return "bodydata"
  430. def begin_bodydata(self, byte):
  431. if self._leadingBodyData:
  432. self.bodydata = self._leadingBodyData
  433. del self._leadingBodyData
  434. else:
  435. self.bodydata = ""
  436. def do_bodydata(self, byte):
  437. if byte == "<":
  438. return "tagstart"
  439. if byte == "&":
  440. return "entityref"
  441. self.bodydata += byte
  442. def end_bodydata(self):
  443. self.gotText(self.bodydata)
  444. self.bodydata = ""
  445. def do_waitforendscript(self, byte):
  446. if byte == "<":
  447. return "waitscriptendtag"
  448. self.bodydata += byte
  449. def begin_waitscriptendtag(self, byte):
  450. self.temptagdata = ""
  451. self.tagName = ""
  452. self.endtag = 0
  453. def do_waitscriptendtag(self, byte):
  454. # 1 enforce / as first byte read
  455. # 2 enforce following bytes to be subset of "script" until
  456. # tagName == "script"
  457. # 2a when that happens, gotText(self.bodydata) and gotTagEnd(self.tagName)
  458. # 3 spaces can happen anywhere, they're ignored
  459. # e.g. < / script >
  460. # 4 anything else causes all data I've read to be moved to the
  461. # bodydata, and switch back to waitforendscript state
  462. # If it turns out this _isn't_ a </script>, we need to
  463. # remember all the data we've been through so we can append it
  464. # to bodydata
  465. self.temptagdata += byte
  466. # 1
  467. if byte == "/":
  468. self.endtag = True
  469. elif not self.endtag:
  470. self.bodydata += "<" + self.temptagdata
  471. return "waitforendscript"
  472. # 2
  473. elif byte.isalnum() or byte in identChars:
  474. self.tagName += byte
  475. if not "script".startswith(self.tagName):
  476. self.bodydata += "<" + self.temptagdata
  477. return "waitforendscript"
  478. elif self.tagName == "script":
  479. self.gotText(self.bodydata)
  480. self.gotTagEnd(self.tagName)
  481. return "waitforgt"
  482. # 3
  483. elif byte.isspace():
  484. return "waitscriptendtag"
  485. # 4
  486. else:
  487. self.bodydata += "<" + self.temptagdata
  488. return "waitforendscript"
  489. def begin_entityref(self, byte):
  490. self.erefbuf = ""
  491. self.erefextra = "" # extra bit for lenient mode
  492. def do_entityref(self, byte):
  493. if byte.isspace() or byte == "<":
  494. if self.beExtremelyLenient:
  495. # '&foo' probably was '&amp;foo'
  496. if self.erefbuf and self.erefbuf != "amp":
  497. self.erefextra = self.erefbuf
  498. self.erefbuf = "amp"
  499. if byte == "<":
  500. return "tagstart"
  501. else:
  502. self.erefextra += byte
  503. return "spacebodydata"
  504. self._parseError("Bad entity reference")
  505. elif byte != ";":
  506. self.erefbuf += byte
  507. else:
  508. return "bodydata"
  509. def end_entityref(self):
  510. self.gotEntityReference(self.erefbuf)
  511. # hacky support for space after & in entityref in beExtremelyLenient
  512. # state should only happen in that case
  513. def begin_spacebodydata(self, byte):
  514. self.bodydata = self.erefextra
  515. self.erefextra = None
  516. do_spacebodydata = do_bodydata
  517. end_spacebodydata = end_bodydata
  518. # Sorta SAX-ish API
  519. def gotTagStart(self, name, attributes):
  520. """Encountered an opening tag.
  521. Default behaviour is to print."""
  522. print("begin", name, attributes)
  523. def gotText(self, data):
  524. """Encountered text
  525. Default behaviour is to print."""
  526. print("text:", repr(data))
  527. def gotEntityReference(self, entityRef):
  528. """Encountered mnemonic entity reference
  529. Default behaviour is to print."""
  530. print("entityRef: &%s;" % entityRef)
  531. def gotComment(self, comment):
  532. """Encountered comment.
  533. Default behaviour is to ignore."""
  534. pass
  535. def gotCData(self, cdata):
  536. """Encountered CDATA
  537. Default behaviour is to call the gotText method"""
  538. self.gotText(cdata)
  539. def gotDoctype(self, doctype):
  540. """Encountered DOCTYPE
  541. This is really grotty: it basically just gives you everything between
  542. '<!DOCTYPE' and '>' as an argument.
  543. """
  544. print("!DOCTYPE", repr(doctype))
  545. def gotTagEnd(self, name):
  546. """Encountered closing tag
  547. Default behaviour is to print."""
  548. print("end", name)