123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637 |
- # -*- test-case-name: twisted.web.test.test_xml -*-
- #
- # Copyright (c) Twisted Matrix Laboratories.
- # See LICENSE for details.
- """
- *S*mall, *U*ncomplicated *X*ML.
- This is a very simple implementation of XML/HTML as a network
- protocol. It is not at all clever. Its main features are that it
- does not:
- - support namespaces
- - mung mnemonic entity references
- - validate
- - perform *any* external actions (such as fetching URLs or writing files)
- under *any* circumstances
- It does, however, have lots and lots of horrible hacks for supporting broken
- HTML (as an option; they're not on by default).
- """
- from __future__ import print_function
- from twisted.internet.protocol import Protocol
- from twisted.python.compat import unicode
- from twisted.python.reflect import prefixedMethodNames
# Elements of the three-tuples in the state table: each state maps to a
# (begin handler, do handler, end handler) tuple, indexed by these constants.
BEGIN_HANDLER = 0
DO_HANDLER = 1
END_HANDLER = 2

# Non-alphanumeric characters accepted in tag and attribute names.
identChars = '.-_:'
# Additional characters tolerated in attribute names and at the start of
# unquoted attribute values when the parser runs with beExtremelyLenient set.
lenientIdentChars = identChars + ';+#/%~'
def nop(*args, **kw):
    """
    Accept any positional and keyword arguments, ignore them, and return
    C{None}.

    Used as the placeholder handler for states that lack a begin, do or end
    method.
    """
    return None
def unionlist(*args):
    """
    Return a list of the distinct items found in any of the given iterables.

    Each item appears once. On Python versions with ordered dictionaries the
    items are in order of first appearance.

    @param args: any number of iterables of hashable items.
    @return: a real C{list} (not a dictionary view), so the result can be
        indexed and compared like the name suggests.
    """
    # dict.fromkeys de-duplicates while remembering first-seen order, without
    # materializing the concatenated input first.
    return list(dict.fromkeys(item for seq in args for item in seq))
def zipfndict(*args, **kw):
    """
    Merge several dictionaries into one dictionary of tuples.

    Every key present in any input maps to a tuple holding, for each input
    dictionary in order, that dictionary's value for the key or the default
    when the key is absent.

    @param args: the dictionaries to combine.
    @param kw: may contain C{default}, the filler for missing keys; it falls
        back to L{nop}.
    @return: a C{dict} mapping each key to a tuple with one entry per input.
    """
    fallback = kw.get('default', nop)
    allNames = unionlist(*[mapping.keys() for mapping in args])
    return {
        name: tuple([mapping.get(name, fallback) for mapping in args])
        for name in allNames
    }
def prefixedMethodClassDict(clazz, prefix):
    """
    Map un-prefixed method names to the attributes of C{clazz} whose names
    start with C{prefix}.

    @param clazz: the class to inspect.
    @param prefix: the name prefix to look for (e.g. C{'do_'}).
    @return: a C{dict} of C{name -> getattr(clazz, prefix + name)} for every
        name reported by L{prefixedMethodNames}.
    """
    return {
        shortName: getattr(clazz, prefix + shortName)
        for shortName in prefixedMethodNames(clazz, prefix)
    }
def prefixedMethodObjDict(obj, prefix):
    """
    Map un-prefixed method names to the attributes of C{obj} whose names
    start with C{prefix}.

    The names are discovered on C{obj.__class__} but the values are looked up
    on C{obj} itself.

    @param obj: the instance to inspect.
    @param prefix: the name prefix to look for (e.g. C{'do_'}).
    @return: a C{dict} of C{name -> getattr(obj, prefix + name)}.
    """
    shortNames = prefixedMethodNames(obj.__class__, prefix)
    return {
        shortName: getattr(obj, prefix + shortName)
        for shortName in shortNames
    }
class ParseError(Exception):
    """
    Raised when the parser meets input it cannot accept.

    @ivar filename: name of the document being parsed.
    @ivar line: line number at which the problem was detected.
    @ivar col: column number at which the problem was detected.
    @ivar message: human-readable description of the problem.
    """

    def __init__(self, filename, line, col, message):
        self.filename, self.line = filename, line
        self.col, self.message = col, message

    def __str__(self):
        # Rendered as "filename:line:col: message".
        location = (self.filename, self.line, self.col)
        return "%s:%s:%s: %s" % (location + (self.message,))
class XMLParser(Protocol):
    """
    A character-at-a-time state machine that parses XML (and, optionally,
    sloppy HTML) delivered through L{dataReceived}.

    Each state C{foo} is implemented by up to three methods: C{begin_foo(byte)}
    runs when the state is entered, C{do_foo(byte)} runs for every character
    and may return the name of the next state, and C{end_foo()} runs when the
    state is left. Parse results are reported through the C{got*} callbacks,
    which subclasses are expected to override.

    @ivar state: name of the current state, or C{None} before any data has
        been received.
    @ivar encodings: list of encodings applied to incoming data; filled in
        when a UTF-16 byte-order mark is seen.
    @ivar filename: name used in L{ParseError} messages.
    @ivar beExtremelyLenient: when true, tolerate many forms of broken HTML
        instead of raising L{ParseError}.
    """

    state = None
    encodings = None
    filename = "<xml />"
    beExtremelyLenient = 0
    # Byte-order mark remembered from the first chunk; re-prepended to every
    # chunk before decoding (see _decode).
    _prepend = None

    # _leadingBodyData will sometimes be set before switching to the
    # 'bodydata' state, when we "accidentally" read a byte of bodydata
    # in a different state.
    _leadingBodyData = None

    # Per-instance cache of the state table (see _buildStateTable).
    _stateTableCache = None

    # Defaults for the attribute currently being parsed.
    attrname = ''
    attrval = ''

    def connectionMade(self):
        """
        Reset the position counters and the list of detected encodings.
        """
        self.lineno = 1
        self.colno = 0
        self.encodings = []

    def saveMark(self):
        '''Get the line number and column of the last character parsed'''
        # This gets replaced during dataReceived, restored afterwards
        return (self.lineno, self.colno)

    def _parseError(self, message):
        """
        Raise a L{ParseError} for C{message} at the current position.

        @raise ParseError: always.
        """
        mark = self.saveMark()
        raise ParseError(*((self.filename,) + mark + (message,)))

    def _buildStateTable(self):
        '''Return a dictionary of begin, do, end state function tuples'''
        # The table holds methods bound to this instance, so it is cached on
        # the instance. (The previous class-level cache was looked up under
        # an un-mangled name and therefore never hit; had it hit, it would
        # have handed out another instance's bound methods.)
        stateTable = self._stateTableCache
        if stateTable is None:
            stateTable = self._stateTableCache = zipfndict(
                *[prefixedMethodObjDict(self, prefix)
                  for prefix in ('begin_', 'do_', 'end_')])
        return stateTable

    def _decode(self, data):
        """
        Decode a chunk of bytes using the encodings detected so far.

        @param data: the raw chunk.
        @return: the decoded text.
        """
        if 'UTF-16' in self.encodings or 'UCS-2' in self.encodings:
            assert not len(data) & 1, 'UTF-16 must come in pairs for now'
        if self._prepend:
            # Put the byte-order mark back so the codec picks the right
            # endianness for this chunk.
            data = self._prepend + data
        for encoding in self.encodings:
            data = unicode(data, encoding)
        return data

    def maybeBodyData(self):
        """
        Pick the state that follows a tag in lenient mode.

        @return: C{'waitforendscript'} after an opening C{<script>} tag that
            has no C{src} attribute, otherwise C{'bodydata'}.
        """
        if self.endtag:
            return 'bodydata'

        # Get ready for fun! We're going to allow
        # <script>if (foo < bar)</script> to work!
        # We do this by making everything between <script> and
        # </script> a Text
        # BUT <script src="foo"> will be special-cased to do regular,
        # lenient behavior, because those may not have </script>
        # -radix
        if self.tagName == 'script' and 'src' not in self.tagAttributes:
            # we do this ourselves rather than having begin_waitforendscript
            # because that can get called multiple times and we don't want
            # bodydata to get reset other than the first time.
            self.begin_bodydata(None)
            return 'waitforendscript'
        return 'bodydata'

    def dataReceived(self, data):
        """
        Decode a chunk of input and run it through the state machine one
        character at a time.

        @param data: a chunk of the document, as bytes.
        @raise ParseError: if a state handler rejects a character.
        """
        stateTable = self._buildStateTable()
        if not self.state:
            # all UTF-16 starts with this string
            if data.startswith((b'\xff\xfe', b'\xfe\xff')):
                self._prepend = data[0:2]
                self.encodings.append('UTF-16')
                data = data[2:]
            self.state = 'begin'
        if self.encodings:
            data = self._decode(data)
        else:
            # NOTE: every chunk is decoded on its own, so a multi-byte UTF-8
            # sequence split across two calls raises UnicodeDecodeError.
            data = data.decode("utf-8")
        # bring state, lineno, colno into local scope
        lineno, colno = self.lineno, self.colno
        curState = self.state
        # replace saveMark with a nested scope function, so that error
        # positions reflect the local counters while the loop is running
        _saveMark = self.saveMark

        def saveMark():
            return (lineno, colno)
        self.saveMark = saveMark
        # fetch functions from the stateTable
        beginFn, doFn, endFn = stateTable[curState]
        try:
            for byte in data:
                # do newline stuff
                if byte == u'\n':
                    lineno += 1
                    colno = 0
                else:
                    colno += 1
                newState = doFn(byte)
                if newState is not None and newState != curState:
                    # this is the endFn from the previous state
                    endFn()
                    curState = newState
                    beginFn, doFn, endFn = stateTable[curState]
                    beginFn(byte)
        finally:
            self.saveMark = _saveMark
            self.lineno, self.colno = lineno, colno
        # state doesn't make sense if there's an exception..
        self.state = curState

    def connectionLost(self, reason):
        """
        End the last state we were in.

        Does nothing when no data was ever received, since there is then no
        state to end.
        """
        if not self.state:
            return
        stateTable = self._buildStateTable()
        # Each entry is a (begin, do, end) tuple; only the end handler runs.
        _beginFn, _doFn, endFn = stateTable[self.state]
        endFn()

    # state methods

    def do_begin(self, byte):
        """
        Skip leading whitespace and expect the document to open with C{<}.

        In lenient mode any other first character is kept as the start of
        body text.
        """
        if byte.isspace():
            return
        if byte != '<':
            if self.beExtremelyLenient:
                self._leadingBodyData = byte
                return 'bodydata'
            self._parseError("First char of document [%r] wasn't <" % (byte,))
        return 'tagstart'

    def begin_comment(self, byte):
        """Start collecting the text of a comment."""
        self.commentbuf = ''

    def do_comment(self, byte):
        """
        Accumulate comment text; on C{-->} report it through L{gotComment}.
        """
        self.commentbuf += byte
        if self.commentbuf.endswith('-->'):
            self.gotComment(self.commentbuf[:-3])
            return 'bodydata'

    def begin_tagstart(self, byte):
        """Reset the per-tag bookkeeping when a C{<} is seen."""
        self.tagName = ''        # name of the tag
        self.tagAttributes = {}  # attributes of the tag
        self.termtag = 0         # is the tag self-terminating
        self.endtag = 0

    def do_tagstart(self, byte):
        """
        Read the tag name and decide what kind of construct follows C{<}.
        """
        if byte.isalnum() or byte in identChars:
            self.tagName += byte
            if self.tagName == '!--':
                return 'comment'
        elif byte.isspace():
            if self.tagName:
                if self.endtag:
                    # properly strict thing to do here is probably to only
                    # accept whitespace
                    return 'waitforgt'
                return 'attrs'
            else:
                self._parseError("Whitespace before tag-name")
        elif byte == '>':
            if self.endtag:
                self.gotTagEnd(self.tagName)
                return 'bodydata'
            else:
                self.gotTagStart(self.tagName, {})
                if self.beExtremelyLenient:
                    return self.maybeBodyData()
                return 'bodydata'
        elif byte == '/':
            if self.tagName:
                return 'afterslash'
            else:
                # "</" : this is a closing tag
                self.endtag = 1
        elif byte in '!?':
            if self.tagName:
                if not self.beExtremelyLenient:
                    self._parseError("Invalid character in tag-name")
            else:
                self.tagName += byte
                self.termtag = 1
        elif byte == '[':
            if self.tagName == '!':
                return 'expectcdata'
            else:
                self._parseError("Invalid '[' in tag-name")
        else:
            if self.beExtremelyLenient:
                # Not really a tag: treat the '<' as literal text.
                self.bodydata = '<'
                return 'unentity'
            self._parseError('Invalid tag character: %r' % byte)

    def begin_unentity(self, byte):
        """Append the character that followed the stray C{<}."""
        self.bodydata += byte

    def do_unentity(self, byte):
        """Append one more character, then resume ordinary body text."""
        self.bodydata += byte
        return 'bodydata'

    def end_unentity(self):
        """Report the text gathered around the stray C{<}."""
        self.gotText(self.bodydata)

    def begin_expectcdata(self, byte):
        """Start matching the C{[CDATA[} header."""
        self.cdatabuf = byte

    def do_expectcdata(self, byte):
        """
        Match the input against C{[CDATA[}, one character at a time.

        @raise ParseError: on a mismatch, unless in lenient mode.
        """
        self.cdatabuf += byte
        cdb = self.cdatabuf
        cd = '[CDATA['
        if len(cd) > len(cdb):
            if cd.startswith(cdb):
                return
            elif self.beExtremelyLenient:
                ## WHAT THE CRAP!?  MSWord9 generates HTML that includes these
                ## bizarre <![if !foo]> <![endif]> chunks, so I've gotta ignore
                ## 'em as best I can.  this should really be a separate parse
                ## state but I don't even have any idea what these _are_.
                return 'waitforgt'
            else:
                self._parseError("Mal-formed CDATA header")
        if cd == cdb:
            self.cdatabuf = ''
            return 'cdata'
        self._parseError("Mal-formed CDATA header")

    def do_cdata(self, byte):
        """Accumulate CDATA content until the closing C{]]>}."""
        self.cdatabuf += byte
        if self.cdatabuf.endswith("]]>"):
            self.cdatabuf = self.cdatabuf[:-3]
            return 'bodydata'

    def end_cdata(self):
        """Report the collected CDATA through L{gotCData}."""
        self.gotCData(self.cdatabuf)
        self.cdatabuf = ''

    def do_attrs(self, byte):
        """
        Between attributes: find the next attribute name or the tag's end.
        """
        if byte.isalnum() or byte in identChars:
            # XXX FIXME really handle !DOCTYPE at some point
            if self.tagName == '!DOCTYPE':
                return 'doctype'
            if self.tagName[0] in '!?':
                return 'waitforgt'
            return 'attrname'
        elif byte.isspace():
            return
        elif byte == '>':
            self.gotTagStart(self.tagName, self.tagAttributes)
            if self.beExtremelyLenient:
                return self.maybeBodyData()
            return 'bodydata'
        elif byte == '/':
            return 'afterslash'
        elif self.beExtremelyLenient:
            # discard and move on?  Only case I've seen of this so far was:
            # <foo bar="baz"">
            return
        self._parseError("Unexpected character: %r" % byte)

    def begin_doctype(self, byte):
        """Start collecting the DOCTYPE declaration."""
        self.doctype = byte

    def do_doctype(self, byte):
        """Accumulate the DOCTYPE text up to the closing C{>}."""
        if byte == '>':
            return 'bodydata'
        self.doctype += byte

    def end_doctype(self):
        """Report the DOCTYPE text through L{gotDoctype}."""
        self.gotDoctype(self.doctype)
        self.doctype = None

    def do_waitforgt(self, byte):
        """Ignore everything until the closing C{>} of the current tag."""
        if byte == '>':
            if self.endtag or not self.beExtremelyLenient:
                return 'bodydata'
            return self.maybeBodyData()

    def begin_attrname(self, byte):
        """Start a new attribute name with its first character."""
        self.attrname = byte
        self._attrname_termtag = 0

    def do_attrname(self, byte):
        """
        Accumulate the attribute name until C{=} or whitespace.

        In lenient mode, also accept a quote directly after the name, extra
        name characters, and a value-less attribute closed by C{>}.
        """
        if byte.isalnum() or byte in identChars:
            self.attrname += byte
            return
        elif byte == '=':
            return 'beforeattrval'
        elif byte.isspace():
            return 'beforeeq'
        elif self.beExtremelyLenient:
            if byte in '"\'':
                return 'attrval'
            if byte in lenientIdentChars or byte.isalnum():
                self.attrname += byte
                return
            # NOTE(review): '/' is a member of lenientIdentChars, so the
            # branch above already consumes it and this one looks
            # unreachable -- confirm the intended handling of <foo bar/>.
            if byte == '/':
                self._attrname_termtag = 1
                return
            if byte == '>':
                # An attribute with no value is recorded as 'True'.
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._attrname_termtag:
                    self.gotTagEnd(self.tagName)
                    return 'bodydata'
                return self.maybeBodyData()
            # something is really broken. let's leave this attribute where it
            # is and move on to the next thing
            return
        self._parseError(
            "Invalid attribute name: %r %r" % (self.attrname, byte))

    def do_beforeattrval(self, byte):
        """
        After C{=}: expect an opening quote (or, leniently, a bare value).
        """
        if byte in '"\'':
            return 'attrval'
        elif byte.isspace():
            return
        elif self.beExtremelyLenient:
            if byte in lenientIdentChars or byte.isalnum():
                return 'messyattr'
            if byte == '>':
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                return self.maybeBodyData()
            if byte == '\\':
                # I saw this in actual HTML once:
                # <font size=\"3\"><sup>SM</sup></font>
                return
        self._parseError(
            "Invalid initial attribute value: %r; Attribute values must be quoted." % byte)

    def begin_beforeeq(self, byte):
        """Whitespace followed an attribute name; nothing closed yet."""
        self._beforeeq_termtag = 0

    def do_beforeeq(self, byte):
        """
        After an attribute name and whitespace: expect C{=}.

        In lenient mode a missing C{=} makes the attribute value-less
        (recorded as C{'True'}).
        """
        if byte == '=':
            return 'beforeattrval'
        elif byte.isspace():
            return
        elif self.beExtremelyLenient:
            if byte.isalnum() or byte in identChars:
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                return 'attrname'
            elif byte == '>':
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._beforeeq_termtag:
                    self.gotTagEnd(self.tagName)
                    return 'bodydata'
                return self.maybeBodyData()
            elif byte == '/':
                self._beforeeq_termtag = 1
                return
        self._parseError("Invalid attribute")

    def begin_attrval(self, byte):
        """Remember which quote character opened the value."""
        self.quotetype = byte
        self.attrval = ''

    def do_attrval(self, byte):
        """Accumulate the value until the matching closing quote."""
        if byte == self.quotetype:
            return 'attrs'
        self.attrval += byte

    def end_attrval(self):
        """Store the finished attribute and clear the scratch fields."""
        self.tagAttributes[self.attrname] = self.attrval
        self.attrname = self.attrval = ''

    def begin_messyattr(self, byte):
        """Start an unquoted attribute value (lenient mode only)."""
        self.attrval = byte

    def do_messyattr(self, byte):
        """
        Accumulate an unquoted value until whitespace or C{>}.

        A value ending in C{/} right before C{>} is treated as closing the
        tag, and the slash is dropped from the value.
        """
        if byte.isspace():
            return 'attrs'
        elif byte == '>':
            endTag = 0
            if self.attrval.endswith('/'):
                endTag = 1
                self.attrval = self.attrval[:-1]
            self.tagAttributes[self.attrname] = self.attrval
            self.gotTagStart(self.tagName, self.tagAttributes)
            if endTag:
                self.gotTagEnd(self.tagName)
                return 'bodydata'
            return self.maybeBodyData()
        else:
            self.attrval += byte

    def end_messyattr(self):
        """Store the unquoted value if it is non-empty."""
        if self.attrval:
            self.tagAttributes[self.attrname] = self.attrval

    def begin_afterslash(self, byte):
        """A C{/} was seen inside a tag; the tag is not closed yet."""
        self._after_slash_closed = 0

    def do_afterslash(self, byte):
        """
        Expect C{>} after the slash of a self-terminating tag and report
        both the start and the end of that tag.
        """
        # this state is only after a self-terminating slash, e.g. <foo/>
        if self._after_slash_closed:
            self._parseError("Mal-formed")  # XXX When does this happen??
        if byte != '>':
            if self.beExtremelyLenient:
                return
            else:
                self._parseError("No data allowed after '/'")
        self._after_slash_closed = 1
        self.gotTagStart(self.tagName, self.tagAttributes)
        self.gotTagEnd(self.tagName)
        # don't need maybeBodyData here because there better not be
        # any javascript code after a <script/>... we'll see :(
        return 'bodydata'

    def begin_bodydata(self, byte):
        """
        Start a run of body text, seeded with any character that an earlier
        state stashed in C{_leadingBodyData}.
        """
        if self._leadingBodyData:
            self.bodydata = self._leadingBodyData
            # Drop the instance value so the class default (None) shows again.
            del self._leadingBodyData
        else:
            self.bodydata = ''

    def do_bodydata(self, byte):
        """Accumulate text until a tag (C{<}) or entity (C{&}) begins."""
        if byte == '<':
            return 'tagstart'
        if byte == '&':
            return 'entityref'
        self.bodydata += byte

    def end_bodydata(self):
        """Report the collected text through L{gotText}."""
        self.gotText(self.bodydata)
        self.bodydata = ''

    def do_waitforendscript(self, byte):
        """Inside a script body: everything is text until a C{<}."""
        if byte == '<':
            return 'waitscriptendtag'
        self.bodydata += byte

    def begin_waitscriptendtag(self, byte):
        """A C{<} was seen inside a script; it may start C{</script>}."""
        self.temptagdata = ''
        self.tagName = ''
        self.endtag = 0

    def do_waitscriptendtag(self, byte):
        """
        Decide whether the characters after C{<} spell C{/script}; if not,
        put them back into the script text.
        """
        # 1 enforce / as first byte read
        # 2 enforce following bytes to be subset of "script" until
        #   tagName == "script"
        #   2a when that happens, gotText(self.bodydata) and
        #      gotTagEnd(self.tagName)
        # 3 spaces can happen anywhere, they're ignored
        #   e.g. < / script >
        # 4 anything else causes all data I've read to be moved to the
        #   bodydata, and switch back to waitforendscript state

        # If it turns out this _isn't_ a </script>, we need to
        # remember all the data we've been through so we can append it
        # to bodydata
        self.temptagdata += byte

        # 1
        if byte == '/':
            self.endtag = True
        elif not self.endtag:
            self.bodydata += "<" + self.temptagdata
            return 'waitforendscript'
        # 2
        elif byte.isalnum() or byte in identChars:
            self.tagName += byte
            if not 'script'.startswith(self.tagName):
                self.bodydata += "<" + self.temptagdata
                return 'waitforendscript'
            elif self.tagName == 'script':
                self.gotText(self.bodydata)
                self.gotTagEnd(self.tagName)
                return 'waitforgt'
        # 3
        elif byte.isspace():
            return 'waitscriptendtag'
        # 4
        else:
            self.bodydata += "<" + self.temptagdata
            return 'waitforendscript'

    def begin_entityref(self, byte):
        """Start collecting the name of an entity reference."""
        self.erefbuf = ''
        self.erefextra = ''  # extra bit for lenient mode

    def do_entityref(self, byte):
        """
        Accumulate the entity name up to C{;}.

        @raise ParseError: if whitespace or C{<} interrupts the reference
            and the parser is not lenient.
        """
        if byte.isspace() or byte == "<":
            if self.beExtremelyLenient:
                # '&foo' probably was meant as '&amp;foo'
                if self.erefbuf and self.erefbuf != "amp":
                    self.erefextra = self.erefbuf
                self.erefbuf = "amp"
                if byte == "<":
                    return "tagstart"
                else:
                    self.erefextra += byte
                    return 'spacebodydata'
            self._parseError("Bad entity reference")
        elif byte != ';':
            self.erefbuf += byte
        else:
            return 'bodydata'

    def end_entityref(self):
        """Report the entity name through L{gotEntityReference}."""
        self.gotEntityReference(self.erefbuf)

    # hacky support for space after & in entityref in beExtremelyLenient
    # state should only happen in that case
    def begin_spacebodydata(self, byte):
        """Resume body text, seeded with what followed the bare C{&}."""
        self.bodydata = self.erefextra
        self.erefextra = None
    do_spacebodydata = do_bodydata
    end_spacebodydata = end_bodydata

    # Sorta SAX-ish API

    def gotTagStart(self, name, attributes):
        '''Encountered an opening tag.

        Default behaviour is to print.'''
        print('begin', name, attributes)

    def gotText(self, data):
        '''Encountered text

        Default behaviour is to print.'''
        print('text:', repr(data))

    def gotEntityReference(self, entityRef):
        '''Encountered mnemonic entity reference

        Default behaviour is to print.'''
        print('entityRef: &%s;' % entityRef)

    def gotComment(self, comment):
        '''Encountered comment.

        Default behaviour is to ignore.'''
        pass

    def gotCData(self, cdata):
        '''Encountered CDATA

        Default behaviour is to call the gotText method'''
        self.gotText(cdata)

    def gotDoctype(self, doctype):
        """Encountered DOCTYPE

        This is really grotty: it basically just gives you everything between
        '<!DOCTYPE' and '>' as an argument.
        """
        print('!DOCTYPE', repr(doctype))

    def gotTagEnd(self, name):
        '''Encountered closing tag

        Default behaviour is to print.'''
        print('end', name)