sux.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637
  1. # -*- test-case-name: twisted.web.test.test_xml -*-
  2. #
  3. # Copyright (c) Twisted Matrix Laboratories.
  4. # See LICENSE for details.
  5. """
  6. *S*mall, *U*ncomplicated *X*ML.
  7. This is a very simple implementation of XML/HTML as a network
  8. protocol. It is not at all clever. Its main features are that it
  9. does not:
  10. - support namespaces
  11. - mung mnemonic entity references
  12. - validate
  13. - perform *any* external actions (such as fetching URLs or writing files)
  14. under *any* circumstances
  15. It does, however, have lots and lots of horrible hacks for supporting
  16. broken HTML (as an option; they're not on by default).
  17. """
  18. from __future__ import print_function
  19. from twisted.internet.protocol import Protocol
  20. from twisted.python.compat import unicode
  21. from twisted.python.reflect import prefixedMethodNames
  22. # Elements of the three-tuples in the state table.
  23. BEGIN_HANDLER = 0
  24. DO_HANDLER = 1
  25. END_HANDLER = 2
  26. identChars = '.-_:'
  27. lenientIdentChars = identChars + ';+#/%~'
  28. def nop(*args, **kw):
  29. "Do nothing."
  30. def unionlist(*args):
  31. l = []
  32. for x in args:
  33. l.extend(x)
  34. d = dict([(x, 1) for x in l])
  35. return d.keys()
  36. def zipfndict(*args, **kw):
  37. default = kw.get('default', nop)
  38. d = {}
  39. for key in unionlist(*[fndict.keys() for fndict in args]):
  40. d[key] = tuple([x.get(key, default) for x in args])
  41. return d
  42. def prefixedMethodClassDict(clazz, prefix):
  43. return dict([(name, getattr(clazz, prefix + name)) for name in prefixedMethodNames(clazz, prefix)])
  44. def prefixedMethodObjDict(obj, prefix):
  45. return dict([(name, getattr(obj, prefix + name)) for name in prefixedMethodNames(obj.__class__, prefix)])
  46. class ParseError(Exception):
  47. def __init__(self, filename, line, col, message):
  48. self.filename = filename
  49. self.line = line
  50. self.col = col
  51. self.message = message
  52. def __str__(self):
  53. return "%s:%s:%s: %s" % (self.filename, self.line, self.col,
  54. self.message)
  55. class XMLParser(Protocol):
  56. state = None
  57. encodings = None
  58. filename = "<xml />"
  59. beExtremelyLenient = 0
  60. _prepend = None
  61. # _leadingBodyData will sometimes be set before switching to the
  62. # 'bodydata' state, when we "accidentally" read a byte of bodydata
  63. # in a different state.
  64. _leadingBodyData = None
  65. def connectionMade(self):
  66. self.lineno = 1
  67. self.colno = 0
  68. self.encodings = []
  69. def saveMark(self):
  70. '''Get the line number and column of the last character parsed'''
  71. # This gets replaced during dataReceived, restored afterwards
  72. return (self.lineno, self.colno)
  73. def _parseError(self, message):
  74. raise ParseError(*((self.filename,)+self.saveMark()+(message,)))
  75. def _buildStateTable(self):
  76. '''Return a dictionary of begin, do, end state function tuples'''
  77. # _buildStateTable leaves something to be desired but it does what it
  78. # does.. probably slowly, so I'm doing some evil caching so it doesn't
  79. # get called more than once per class.
  80. stateTable = getattr(self.__class__, '__stateTable', None)
  81. if stateTable is None:
  82. stateTable = self.__class__.__stateTable = zipfndict(
  83. *[prefixedMethodObjDict(self, prefix)
  84. for prefix in ('begin_', 'do_', 'end_')])
  85. return stateTable
  86. def _decode(self, data):
  87. if 'UTF-16' in self.encodings or 'UCS-2' in self.encodings:
  88. assert not len(data) & 1, 'UTF-16 must come in pairs for now'
  89. if self._prepend:
  90. data = self._prepend + data
  91. for encoding in self.encodings:
  92. data = unicode(data, encoding)
  93. return data
  94. def maybeBodyData(self):
  95. if self.endtag:
  96. return 'bodydata'
  97. # Get ready for fun! We're going to allow
  98. # <script>if (foo < bar)</script> to work!
  99. # We do this by making everything between <script> and
  100. # </script> a Text
  101. # BUT <script src="foo"> will be special-cased to do regular,
  102. # lenient behavior, because those may not have </script>
  103. # -radix
  104. if (self.tagName == 'script' and 'src' not in self.tagAttributes):
  105. # we do this ourselves rather than having begin_waitforendscript
  106. # because that can get called multiple times and we don't want
  107. # bodydata to get reset other than the first time.
  108. self.begin_bodydata(None)
  109. return 'waitforendscript'
  110. return 'bodydata'
  111. def dataReceived(self, data):
  112. stateTable = self._buildStateTable()
  113. if not self.state:
  114. # all UTF-16 starts with this string
  115. if data.startswith((b'\xff\xfe', b'\xfe\xff')):
  116. self._prepend = data[0:2]
  117. self.encodings.append('UTF-16')
  118. data = data[2:]
  119. self.state = 'begin'
  120. if self.encodings:
  121. data = self._decode(data)
  122. else:
  123. data = data.decode("utf-8")
  124. # bring state, lineno, colno into local scope
  125. lineno, colno = self.lineno, self.colno
  126. curState = self.state
  127. # replace saveMark with a nested scope function
  128. _saveMark = self.saveMark
  129. def saveMark():
  130. return (lineno, colno)
  131. self.saveMark = saveMark
  132. # fetch functions from the stateTable
  133. beginFn, doFn, endFn = stateTable[curState]
  134. try:
  135. for byte in data:
  136. # do newline stuff
  137. if byte == u'\n':
  138. lineno += 1
  139. colno = 0
  140. else:
  141. colno += 1
  142. newState = doFn(byte)
  143. if newState is not None and newState != curState:
  144. # this is the endFn from the previous state
  145. endFn()
  146. curState = newState
  147. beginFn, doFn, endFn = stateTable[curState]
  148. beginFn(byte)
  149. finally:
  150. self.saveMark = _saveMark
  151. self.lineno, self.colno = lineno, colno
  152. # state doesn't make sense if there's an exception..
  153. self.state = curState
  154. def connectionLost(self, reason):
  155. """
  156. End the last state we were in.
  157. """
  158. stateTable = self._buildStateTable()
  159. stateTable[self.state][END_HANDLER]()
  160. # state methods
  161. def do_begin(self, byte):
  162. if byte.isspace():
  163. return
  164. if byte != '<':
  165. if self.beExtremelyLenient:
  166. self._leadingBodyData = byte
  167. return 'bodydata'
  168. self._parseError("First char of document [%r] wasn't <" % (byte,))
  169. return 'tagstart'
  170. def begin_comment(self, byte):
  171. self.commentbuf = ''
  172. def do_comment(self, byte):
  173. self.commentbuf += byte
  174. if self.commentbuf.endswith('-->'):
  175. self.gotComment(self.commentbuf[:-3])
  176. return 'bodydata'
  177. def begin_tagstart(self, byte):
  178. self.tagName = '' # name of the tag
  179. self.tagAttributes = {} # attributes of the tag
  180. self.termtag = 0 # is the tag self-terminating
  181. self.endtag = 0
  182. def do_tagstart(self, byte):
  183. if byte.isalnum() or byte in identChars:
  184. self.tagName += byte
  185. if self.tagName == '!--':
  186. return 'comment'
  187. elif byte.isspace():
  188. if self.tagName:
  189. if self.endtag:
  190. # properly strict thing to do here is probably to only
  191. # accept whitespace
  192. return 'waitforgt'
  193. return 'attrs'
  194. else:
  195. self._parseError("Whitespace before tag-name")
  196. elif byte == '>':
  197. if self.endtag:
  198. self.gotTagEnd(self.tagName)
  199. return 'bodydata'
  200. else:
  201. self.gotTagStart(self.tagName, {})
  202. return (not self.beExtremelyLenient) and 'bodydata' or self.maybeBodyData()
  203. elif byte == '/':
  204. if self.tagName:
  205. return 'afterslash'
  206. else:
  207. self.endtag = 1
  208. elif byte in '!?':
  209. if self.tagName:
  210. if not self.beExtremelyLenient:
  211. self._parseError("Invalid character in tag-name")
  212. else:
  213. self.tagName += byte
  214. self.termtag = 1
  215. elif byte == '[':
  216. if self.tagName == '!':
  217. return 'expectcdata'
  218. else:
  219. self._parseError("Invalid '[' in tag-name")
  220. else:
  221. if self.beExtremelyLenient:
  222. self.bodydata = '<'
  223. return 'unentity'
  224. self._parseError('Invalid tag character: %r'% byte)
  225. def begin_unentity(self, byte):
  226. self.bodydata += byte
  227. def do_unentity(self, byte):
  228. self.bodydata += byte
  229. return 'bodydata'
  230. def end_unentity(self):
  231. self.gotText(self.bodydata)
  232. def begin_expectcdata(self, byte):
  233. self.cdatabuf = byte
  234. def do_expectcdata(self, byte):
  235. self.cdatabuf += byte
  236. cdb = self.cdatabuf
  237. cd = '[CDATA['
  238. if len(cd) > len(cdb):
  239. if cd.startswith(cdb):
  240. return
  241. elif self.beExtremelyLenient:
  242. ## WHAT THE CRAP!? MSWord9 generates HTML that includes these
  243. ## bizarre <![if !foo]> <![endif]> chunks, so I've gotta ignore
  244. ## 'em as best I can. this should really be a separate parse
  245. ## state but I don't even have any idea what these _are_.
  246. return 'waitforgt'
  247. else:
  248. self._parseError("Mal-formed CDATA header")
  249. if cd == cdb:
  250. self.cdatabuf = ''
  251. return 'cdata'
  252. self._parseError("Mal-formed CDATA header")
  253. def do_cdata(self, byte):
  254. self.cdatabuf += byte
  255. if self.cdatabuf.endswith("]]>"):
  256. self.cdatabuf = self.cdatabuf[:-3]
  257. return 'bodydata'
  258. def end_cdata(self):
  259. self.gotCData(self.cdatabuf)
  260. self.cdatabuf = ''
  261. def do_attrs(self, byte):
  262. if byte.isalnum() or byte in identChars:
  263. # XXX FIXME really handle !DOCTYPE at some point
  264. if self.tagName == '!DOCTYPE':
  265. return 'doctype'
  266. if self.tagName[0] in '!?':
  267. return 'waitforgt'
  268. return 'attrname'
  269. elif byte.isspace():
  270. return
  271. elif byte == '>':
  272. self.gotTagStart(self.tagName, self.tagAttributes)
  273. return (not self.beExtremelyLenient) and 'bodydata' or self.maybeBodyData()
  274. elif byte == '/':
  275. return 'afterslash'
  276. elif self.beExtremelyLenient:
  277. # discard and move on? Only case I've seen of this so far was:
  278. # <foo bar="baz"">
  279. return
  280. self._parseError("Unexpected character: %r" % byte)
  281. def begin_doctype(self, byte):
  282. self.doctype = byte
  283. def do_doctype(self, byte):
  284. if byte == '>':
  285. return 'bodydata'
  286. self.doctype += byte
  287. def end_doctype(self):
  288. self.gotDoctype(self.doctype)
  289. self.doctype = None
  290. def do_waitforgt(self, byte):
  291. if byte == '>':
  292. if self.endtag or not self.beExtremelyLenient:
  293. return 'bodydata'
  294. return self.maybeBodyData()
  295. def begin_attrname(self, byte):
  296. self.attrname = byte
  297. self._attrname_termtag = 0
  298. def do_attrname(self, byte):
  299. if byte.isalnum() or byte in identChars:
  300. self.attrname += byte
  301. return
  302. elif byte == '=':
  303. return 'beforeattrval'
  304. elif byte.isspace():
  305. return 'beforeeq'
  306. elif self.beExtremelyLenient:
  307. if byte in '"\'':
  308. return 'attrval'
  309. if byte in lenientIdentChars or byte.isalnum():
  310. self.attrname += byte
  311. return
  312. if byte == '/':
  313. self._attrname_termtag = 1
  314. return
  315. if byte == '>':
  316. self.attrval = 'True'
  317. self.tagAttributes[self.attrname] = self.attrval
  318. self.gotTagStart(self.tagName, self.tagAttributes)
  319. if self._attrname_termtag:
  320. self.gotTagEnd(self.tagName)
  321. return 'bodydata'
  322. return self.maybeBodyData()
  323. # something is really broken. let's leave this attribute where it
  324. # is and move on to the next thing
  325. return
  326. self._parseError("Invalid attribute name: %r %r" % (self.attrname, byte))
  327. def do_beforeattrval(self, byte):
  328. if byte in '"\'':
  329. return 'attrval'
  330. elif byte.isspace():
  331. return
  332. elif self.beExtremelyLenient:
  333. if byte in lenientIdentChars or byte.isalnum():
  334. return 'messyattr'
  335. if byte == '>':
  336. self.attrval = 'True'
  337. self.tagAttributes[self.attrname] = self.attrval
  338. self.gotTagStart(self.tagName, self.tagAttributes)
  339. return self.maybeBodyData()
  340. if byte == '\\':
  341. # I saw this in actual HTML once:
  342. # <font size=\"3\"><sup>SM</sup></font>
  343. return
  344. self._parseError("Invalid initial attribute value: %r; Attribute values must be quoted." % byte)
  345. attrname = ''
  346. attrval = ''
  347. def begin_beforeeq(self,byte):
  348. self._beforeeq_termtag = 0
  349. def do_beforeeq(self, byte):
  350. if byte == '=':
  351. return 'beforeattrval'
  352. elif byte.isspace():
  353. return
  354. elif self.beExtremelyLenient:
  355. if byte.isalnum() or byte in identChars:
  356. self.attrval = 'True'
  357. self.tagAttributes[self.attrname] = self.attrval
  358. return 'attrname'
  359. elif byte == '>':
  360. self.attrval = 'True'
  361. self.tagAttributes[self.attrname] = self.attrval
  362. self.gotTagStart(self.tagName, self.tagAttributes)
  363. if self._beforeeq_termtag:
  364. self.gotTagEnd(self.tagName)
  365. return 'bodydata'
  366. return self.maybeBodyData()
  367. elif byte == '/':
  368. self._beforeeq_termtag = 1
  369. return
  370. self._parseError("Invalid attribute")
  371. def begin_attrval(self, byte):
  372. self.quotetype = byte
  373. self.attrval = ''
  374. def do_attrval(self, byte):
  375. if byte == self.quotetype:
  376. return 'attrs'
  377. self.attrval += byte
  378. def end_attrval(self):
  379. self.tagAttributes[self.attrname] = self.attrval
  380. self.attrname = self.attrval = ''
  381. def begin_messyattr(self, byte):
  382. self.attrval = byte
  383. def do_messyattr(self, byte):
  384. if byte.isspace():
  385. return 'attrs'
  386. elif byte == '>':
  387. endTag = 0
  388. if self.attrval.endswith('/'):
  389. endTag = 1
  390. self.attrval = self.attrval[:-1]
  391. self.tagAttributes[self.attrname] = self.attrval
  392. self.gotTagStart(self.tagName, self.tagAttributes)
  393. if endTag:
  394. self.gotTagEnd(self.tagName)
  395. return 'bodydata'
  396. return self.maybeBodyData()
  397. else:
  398. self.attrval += byte
  399. def end_messyattr(self):
  400. if self.attrval:
  401. self.tagAttributes[self.attrname] = self.attrval
  402. def begin_afterslash(self, byte):
  403. self._after_slash_closed = 0
  404. def do_afterslash(self, byte):
  405. # this state is only after a self-terminating slash, e.g. <foo/>
  406. if self._after_slash_closed:
  407. self._parseError("Mal-formed")#XXX When does this happen??
  408. if byte != '>':
  409. if self.beExtremelyLenient:
  410. return
  411. else:
  412. self._parseError("No data allowed after '/'")
  413. self._after_slash_closed = 1
  414. self.gotTagStart(self.tagName, self.tagAttributes)
  415. self.gotTagEnd(self.tagName)
  416. # don't need maybeBodyData here because there better not be
  417. # any javascript code after a <script/>... we'll see :(
  418. return 'bodydata'
  419. def begin_bodydata(self, byte):
  420. if self._leadingBodyData:
  421. self.bodydata = self._leadingBodyData
  422. del self._leadingBodyData
  423. else:
  424. self.bodydata = ''
  425. def do_bodydata(self, byte):
  426. if byte == '<':
  427. return 'tagstart'
  428. if byte == '&':
  429. return 'entityref'
  430. self.bodydata += byte
  431. def end_bodydata(self):
  432. self.gotText(self.bodydata)
  433. self.bodydata = ''
  434. def do_waitforendscript(self, byte):
  435. if byte == '<':
  436. return 'waitscriptendtag'
  437. self.bodydata += byte
  438. def begin_waitscriptendtag(self, byte):
  439. self.temptagdata = ''
  440. self.tagName = ''
  441. self.endtag = 0
  442. def do_waitscriptendtag(self, byte):
  443. # 1 enforce / as first byte read
  444. # 2 enforce following bytes to be subset of "script" until
  445. # tagName == "script"
  446. # 2a when that happens, gotText(self.bodydata) and gotTagEnd(self.tagName)
  447. # 3 spaces can happen anywhere, they're ignored
  448. # e.g. < / script >
  449. # 4 anything else causes all data I've read to be moved to the
  450. # bodydata, and switch back to waitforendscript state
  451. # If it turns out this _isn't_ a </script>, we need to
  452. # remember all the data we've been through so we can append it
  453. # to bodydata
  454. self.temptagdata += byte
  455. # 1
  456. if byte == '/':
  457. self.endtag = True
  458. elif not self.endtag:
  459. self.bodydata += "<" + self.temptagdata
  460. return 'waitforendscript'
  461. # 2
  462. elif byte.isalnum() or byte in identChars:
  463. self.tagName += byte
  464. if not 'script'.startswith(self.tagName):
  465. self.bodydata += "<" + self.temptagdata
  466. return 'waitforendscript'
  467. elif self.tagName == 'script':
  468. self.gotText(self.bodydata)
  469. self.gotTagEnd(self.tagName)
  470. return 'waitforgt'
  471. # 3
  472. elif byte.isspace():
  473. return 'waitscriptendtag'
  474. # 4
  475. else:
  476. self.bodydata += "<" + self.temptagdata
  477. return 'waitforendscript'
  478. def begin_entityref(self, byte):
  479. self.erefbuf = ''
  480. self.erefextra = '' # extra bit for lenient mode
  481. def do_entityref(self, byte):
  482. if byte.isspace() or byte == "<":
  483. if self.beExtremelyLenient:
  484. # '&foo' probably was '&amp;foo'
  485. if self.erefbuf and self.erefbuf != "amp":
  486. self.erefextra = self.erefbuf
  487. self.erefbuf = "amp"
  488. if byte == "<":
  489. return "tagstart"
  490. else:
  491. self.erefextra += byte
  492. return 'spacebodydata'
  493. self._parseError("Bad entity reference")
  494. elif byte != ';':
  495. self.erefbuf += byte
  496. else:
  497. return 'bodydata'
  498. def end_entityref(self):
  499. self.gotEntityReference(self.erefbuf)
  500. # hacky support for space after & in entityref in beExtremelyLenient
  501. # state should only happen in that case
  502. def begin_spacebodydata(self, byte):
  503. self.bodydata = self.erefextra
  504. self.erefextra = None
  505. do_spacebodydata = do_bodydata
  506. end_spacebodydata = end_bodydata
  507. # Sorta SAX-ish API
  508. def gotTagStart(self, name, attributes):
  509. '''Encountered an opening tag.
  510. Default behaviour is to print.'''
  511. print('begin', name, attributes)
  512. def gotText(self, data):
  513. '''Encountered text
  514. Default behaviour is to print.'''
  515. print('text:', repr(data))
  516. def gotEntityReference(self, entityRef):
  517. '''Encountered mnemonic entity reference
  518. Default behaviour is to print.'''
  519. print('entityRef: &%s;' % entityRef)
  520. def gotComment(self, comment):
  521. '''Encountered comment.
  522. Default behaviour is to ignore.'''
  523. pass
  524. def gotCData(self, cdata):
  525. '''Encountered CDATA
  526. Default behaviour is to call the gotText method'''
  527. self.gotText(cdata)
  528. def gotDoctype(self, doctype):
  529. """Encountered DOCTYPE
  530. This is really grotty: it basically just gives you everything between
  531. '<!DOCTYPE' and '>' as an argument.
  532. """
  533. print('!DOCTYPE', repr(doctype))
  534. def gotTagEnd(self, name):
  535. '''Encountered closing tag
  536. Default behaviour is to print.'''
  537. print('end', name)