# Repository: SMusatov/ydb
# (mirror of https://github.com/ydb-platform/ydb.git)


			
# -*- test-case-name: twisted.web.test.test_xml -*-
#
# Copyright (c) Twisted Matrix Laboratories.
# See LICENSE for details.


"""
*S*mall, *U*ncomplicated *X*ML.

This is a very simple implementation of XML/HTML as a network
protocol.  It is not at all clever.  Its main features are that it
does not:

  - support namespaces
  - mung mnemonic entity references
  - validate
  - perform *any* external actions (such as fetching URLs or writing files)
    under *any* circumstances

It does, however, have lots and lots of horrible hacks for supporting broken
HTML (as an option; they're not on by default).
"""


from twisted.internet.protocol import Protocol
from twisted.python.reflect import prefixedMethodNames

# Elements of the three-tuples in the state table.
BEGIN_HANDLER = 0
DO_HANDLER = 1
END_HANDLER = 2

identChars = ".-_:"
lenientIdentChars = identChars + ";+#/%~"


def nop(*args, **kw):
    """Accept any arguments, do nothing, and return None."""
    return None


def unionlist(*args):
    """
    Merge several iterables into one collection of unique items.

    Items keep the order in which they were first seen.  Note that the result
    is the key view of a dictionary rather than a list.
    """
    merged = [item for sequence in args for item in sequence]
    return dict.fromkeys(merged, 1).keys()


def zipfndict(*args, **kw):
    """
    Combine several dictionaries into one dictionary of tuples.

    Every key that appears in any of the given dictionaries maps to a tuple
    holding, for each dictionary in argument order, that dictionary's value
    for the key.  Where a dictionary lacks the key, the value of the
    C{default} keyword argument (C{nop} if not given) is used instead.
    """
    fallback = kw.get("default", nop)
    allKeys = unionlist(*(table.keys() for table in args))
    return {
        name: tuple(table.get(name, fallback) for table in args)
        for name in allKeys
    }


def prefixedMethodClassDict(clazz, prefix):
    """
    Look up the prefixed methods of a class.

    For each name reported by C{prefixedMethodNames(clazz, prefix)}, fetch the
    attribute called C{prefix + name} from the class and return them all in a
    dictionary keyed by the un-prefixed name.
    """
    found = {}
    for suffix in prefixedMethodNames(clazz, prefix):
        found[suffix] = getattr(clazz, prefix + suffix)
    return found


def prefixedMethodObjDict(obj, prefix):
    """
    Look up the prefixed methods of an object.

    The names come from C{prefixedMethodNames} applied to the object's class,
    but each attribute is fetched from the object itself, so the values are
    whatever C{getattr(obj, prefix + name)} yields for that instance.
    """
    found = {}
    for suffix in prefixedMethodNames(obj.__class__, prefix):
        found[suffix] = getattr(obj, prefix + suffix)
    return found


class ParseError(Exception):
    """
    Raised when the parser meets input it cannot accept.

    The location and description are available as the C{filename}, C{line},
    C{col} and C{message} attributes, and C{str()} renders them as
    C{filename:line:col: message}.
    """

    def __init__(self, filename, line, col, message):
        # Hand the arguments to Exception as well, so that ``args`` is
        # populated: repr() then shows the details and the exception can be
        # rebuilt from ``args`` (copying and pickling both rely on that).
        super().__init__(filename, line, col, message)
        self.filename = filename
        self.line = line
        self.col = col
        self.message = message

    def __str__(self) -> str:
        return f"{self.filename}:{self.line}:{self.col}: {self.message}"


class XMLParser(Protocol):
    state = None
    encodings = None
    filename = "<xml />"
    beExtremelyLenient = 0
    _prepend = None

    # _leadingBodyData will sometimes be set before switching to the
    # 'bodydata' state, when we "accidentally" read a byte of bodydata
    # in a different state.
    _leadingBodyData = None

    def connectionMade(self):
        self.lineno = 1
        self.colno = 0
        self.encodings = []

    def saveMark(self):
        """Get the line number and column of the last character parsed"""
        # This gets replaced during dataReceived, restored afterwards
        return (self.lineno, self.colno)

    def _parseError(self, message):
        """Raise a ParseError for the current file and position with message."""
        location = (self.filename,) + self.saveMark()
        raise ParseError(*location, message)

    def _buildStateTable(self):
        """
        Return a dictionary of begin, do, end state function tuples.

        Each key is a state name; each value is a three-tuple of the
        C{begin_}, C{do_} and C{end_} handlers for that state, looked up on
        this instance.  A state that lacks one of the three gets C{zipfndict}'s
        default in that slot.
        """
        # The table holds methods bound to *this* parser, so it is deliberately
        # not stored on the class: a class-level copy would keep the most
        # recently used parser alive, and reading it back from another instance
        # would route that instance's input through the wrong parser.
        return zipfndict(
            *(
                prefixedMethodObjDict(self, prefix)
                for prefix in ("begin_", "do_", "end_")
            )
        )

    def _decode(self, data):
        if "UTF-16" in self.encodings or "UCS-2" in self.encodings:
            assert not len(data) & 1, "UTF-16 must come in pairs for now"
        if self._prepend:
            data = self._prepend + data
        for encoding in self.encodings:
            data = str(data, encoding)
        return data

    def maybeBodyData(self):
        if self.endtag:
            return "bodydata"

        # Get ready for fun! We're going to allow
        # <script>if (foo < bar)</script> to work!
        # We do this by making everything between <script> and
        # </script> a Text
        # BUT <script src="foo"> will be special-cased to do regular,
        # lenient behavior, because those may not have </script>
        # -radix

        if self.tagName == "script" and "src" not in self.tagAttributes:
            # we do this ourselves rather than having begin_waitforendscript
            # because that can get called multiple times and we don't want
            # bodydata to get reset other than the first time.
            self.begin_bodydata(None)
            return "waitforendscript"
        return "bodydata"

    def dataReceived(self, data):
        stateTable = self._buildStateTable()
        if not self.state:
            # all UTF-16 starts with this string
            if data.startswith((b"\xff\xfe", b"\xfe\xff")):
                self._prepend = data[0:2]
                self.encodings.append("UTF-16")
                data = data[2:]
            self.state = "begin"
        if self.encodings:
            data = self._decode(data)
        else:
            data = data.decode("utf-8")
        # bring state, lineno, colno into local scope
        lineno, colno = self.lineno, self.colno
        curState = self.state
        # replace saveMark with a nested scope function
        _saveMark = self.saveMark

        def saveMark():
            return (lineno, colno)

        self.saveMark = saveMark
        # fetch functions from the stateTable
        beginFn, doFn, endFn = stateTable[curState]
        try:
            for byte in data:
                # do newline stuff
                if byte == "\n":
                    lineno += 1
                    colno = 0
                else:
                    colno += 1
                newState = doFn(byte)
                if newState is not None and newState != curState:
                    # this is the endFn from the previous state
                    endFn()
                    curState = newState
                    beginFn, doFn, endFn = stateTable[curState]
                    beginFn(byte)
        finally:
            self.saveMark = _saveMark
            self.lineno, self.colno = lineno, colno
        # state doesn't make sense if there's an exception..
        self.state = curState

    def connectionLost(self, reason):
        """
        End the last state we were in.
        """
        stateTable = self._buildStateTable()
        stateTable[self.state][END_HANDLER]()

    # state methods

    def do_begin(self, byte):
        if byte.isspace():
            return
        if byte != "<":
            if self.beExtremelyLenient:
                self._leadingBodyData = byte
                return "bodydata"
            self._parseError(f"First char of document [{byte!r}] wasn't <")
        return "tagstart"

    def begin_comment(self, byte):
        self.commentbuf = ""

    def do_comment(self, byte):
        self.commentbuf += byte
        if self.commentbuf.endswith("-->"):
            self.gotComment(self.commentbuf[:-3])
            return "bodydata"

    def begin_tagstart(self, byte):
        self.tagName = ""  # name of the tag
        self.tagAttributes = {}  # attributes of the tag
        self.termtag = 0  # is the tag self-terminating
        self.endtag = 0

    def do_tagstart(self, byte):
        if byte.isalnum() or byte in identChars:
            self.tagName += byte
            if self.tagName == "!--":
                return "comment"
        elif byte.isspace():
            if self.tagName:
                if self.endtag:
                    # properly strict thing to do here is probably to only
                    # accept whitespace
                    return "waitforgt"
                return "attrs"
            else:
                self._parseError("Whitespace before tag-name")
        elif byte == ">":
            if self.endtag:
                self.gotTagEnd(self.tagName)
                return "bodydata"
            else:
                self.gotTagStart(self.tagName, {})
                return (
                    (not self.beExtremelyLenient) and "bodydata" or self.maybeBodyData()
                )
        elif byte == "/":
            if self.tagName:
                return "afterslash"
            else:
                self.endtag = 1
        elif byte in "!?":
            if self.tagName:
                if not self.beExtremelyLenient:
                    self._parseError("Invalid character in tag-name")
            else:
                self.tagName += byte
                self.termtag = 1
        elif byte == "[":
            if self.tagName == "!":
                return "expectcdata"
            else:
                self._parseError("Invalid '[' in tag-name")
        else:
            if self.beExtremelyLenient:
                self.bodydata = "<"
                return "unentity"
            self._parseError("Invalid tag character: %r" % byte)

    def begin_unentity(self, byte):
        self.bodydata += byte

    def do_unentity(self, byte):
        self.bodydata += byte
        return "bodydata"

    def end_unentity(self):
        self.gotText(self.bodydata)

    def begin_expectcdata(self, byte):
        self.cdatabuf = byte

    def do_expectcdata(self, byte):
        self.cdatabuf += byte
        cdb = self.cdatabuf
        cd = "[CDATA["
        if len(cd) > len(cdb):
            if cd.startswith(cdb):
                return
            elif self.beExtremelyLenient:
                ## WHAT THE CRAP!?  MSWord9 generates HTML that includes these
                ## bizarre <![if !foo]> <![endif]> chunks, so I've gotta ignore
                ## 'em as best I can.  this should really be a separate parse
                ## state but I don't even have any idea what these _are_.
                return "waitforgt"
            else:
                self._parseError("Mal-formed CDATA header")
        if cd == cdb:
            self.cdatabuf = ""
            return "cdata"
        self._parseError("Mal-formed CDATA header")

    def do_cdata(self, byte):
        self.cdatabuf += byte
        if self.cdatabuf.endswith("]]>"):
            self.cdatabuf = self.cdatabuf[:-3]
            return "bodydata"

    def end_cdata(self):
        self.gotCData(self.cdatabuf)
        self.cdatabuf = ""

    def do_attrs(self, byte):
        if byte.isalnum() or byte in identChars:
            # XXX FIXME really handle !DOCTYPE at some point
            if self.tagName == "!DOCTYPE":
                return "doctype"
            if self.tagName[0] in "!?":
                return "waitforgt"
            return "attrname"
        elif byte.isspace():
            return
        elif byte == ">":
            self.gotTagStart(self.tagName, self.tagAttributes)
            return (not self.beExtremelyLenient) and "bodydata" or self.maybeBodyData()
        elif byte == "/":
            return "afterslash"
        elif self.beExtremelyLenient:
            # discard and move on?  Only case I've seen of this so far was:
            # <foo bar="baz"">
            return
        self._parseError("Unexpected character: %r" % byte)

    def begin_doctype(self, byte):
        self.doctype = byte

    def do_doctype(self, byte):
        if byte == ">":
            return "bodydata"
        self.doctype += byte

    def end_doctype(self):
        self.gotDoctype(self.doctype)
        self.doctype = None

    def do_waitforgt(self, byte):
        if byte == ">":
            if self.endtag or not self.beExtremelyLenient:
                return "bodydata"
            return self.maybeBodyData()

    def begin_attrname(self, byte):
        self.attrname = byte
        self._attrname_termtag = 0

    def do_attrname(self, byte):
        if byte.isalnum() or byte in identChars:
            self.attrname += byte
            return
        elif byte == "=":
            return "beforeattrval"
        elif byte.isspace():
            return "beforeeq"
        elif self.beExtremelyLenient:
            if byte in "\"'":
                return "attrval"
            if byte in lenientIdentChars or byte.isalnum():
                self.attrname += byte
                return
            if byte == "/":
                self._attrname_termtag = 1
                return
            if byte == ">":
                self.attrval = "True"
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._attrname_termtag:
                    self.gotTagEnd(self.tagName)
                    return "bodydata"
                return self.maybeBodyData()
            # something is really broken. let's leave this attribute where it
            # is and move on to the next thing
            return
        self._parseError(f"Invalid attribute name: {self.attrname!r} {byte!r}")

    def do_beforeattrval(self, byte):
        if byte in "\"'":
            return "attrval"
        elif byte.isspace():
            return
        elif self.beExtremelyLenient:
            if byte in lenientIdentChars or byte.isalnum():
                return "messyattr"
            if byte == ">":
                self.attrval = "True"
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                return self.maybeBodyData()
            if byte == "\\":
                # I saw this in actual HTML once:
                # <font size=\"3\"><sup>SM</sup></font>
                return
        self._parseError(
            "Invalid initial attribute value: %r; Attribute values must be quoted."
            % byte
        )

    attrname = ""
    attrval = ""

    def begin_beforeeq(self, byte):
        self._beforeeq_termtag = 0

    def do_beforeeq(self, byte):
        if byte == "=":
            return "beforeattrval"
        elif byte.isspace():
            return
        elif self.beExtremelyLenient:
            if byte.isalnum() or byte in identChars:
                self.attrval = "True"
                self.tagAttributes[self.attrname] = self.attrval
                return "attrname"
            elif byte == ">":
                self.attrval = "True"
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._beforeeq_termtag:
                    self.gotTagEnd(self.tagName)
                    return "bodydata"
                return self.maybeBodyData()
            elif byte == "/":
                self._beforeeq_termtag = 1
                return
        self._parseError("Invalid attribute")

    def begin_attrval(self, byte):
        self.quotetype = byte
        self.attrval = ""

    def do_attrval(self, byte):
        if byte == self.quotetype:
            return "attrs"
        self.attrval += byte

    def end_attrval(self):
        self.tagAttributes[self.attrname] = self.attrval
        self.attrname = self.attrval = ""

    def begin_messyattr(self, byte):
        self.attrval = byte

    def do_messyattr(self, byte):
        if byte.isspace():
            return "attrs"
        elif byte == ">":
            endTag = 0
            if self.attrval.endswith("/"):
                endTag = 1
                self.attrval = self.attrval[:-1]
            self.tagAttributes[self.attrname] = self.attrval
            self.gotTagStart(self.tagName, self.tagAttributes)
            if endTag:
                self.gotTagEnd(self.tagName)
                return "bodydata"
            return self.maybeBodyData()
        else:
            self.attrval += byte

    def end_messyattr(self):
        if self.attrval:
            self.tagAttributes[self.attrname] = self.attrval

    def begin_afterslash(self, byte):
        self._after_slash_closed = 0

    def do_afterslash(self, byte):
        # this state is only after a self-terminating slash, e.g. <foo/>
        if self._after_slash_closed:
            self._parseError("Mal-formed")  # XXX When does this happen??
        if byte != ">":
            if self.beExtremelyLenient:
                return
            else:
                self._parseError("No data allowed after '/'")
        self._after_slash_closed = 1
        self.gotTagStart(self.tagName, self.tagAttributes)
        self.gotTagEnd(self.tagName)
        # don't need maybeBodyData here because there better not be
        # any javascript code after a <script/>... we'll see :(
        return "bodydata"

    def begin_bodydata(self, byte):
        if self._leadingBodyData:
            self.bodydata = self._leadingBodyData
            del self._leadingBodyData
        else:
            self.bodydata = ""

    def do_bodydata(self, byte):
        if byte == "<":
            return "tagstart"
        if byte == "&":
            return "entityref"
        self.bodydata += byte

    def end_bodydata(self):
        self.gotText(self.bodydata)
        self.bodydata = ""

    def do_waitforendscript(self, byte):
        if byte == "<":
            return "waitscriptendtag"
        self.bodydata += byte

    def begin_waitscriptendtag(self, byte):
        self.temptagdata = ""
        self.tagName = ""
        self.endtag = 0

    def do_waitscriptendtag(self, byte):
        # 1 enforce / as first byte read
        # 2 enforce following bytes to be subset of "script" until
        #   tagName == "script"
        #   2a when that happens, gotText(self.bodydata) and gotTagEnd(self.tagName)
        # 3 spaces can happen anywhere, they're ignored
        #   e.g. < / script >
        # 4 anything else causes all data I've read to be moved to the
        #   bodydata, and switch back to waitforendscript state

        # If it turns out this _isn't_ a </script>, we need to
        # remember all the data we've been through so we can append it
        # to bodydata
        self.temptagdata += byte

        # 1
        if byte == "/":
            self.endtag = True
        elif not self.endtag:
            self.bodydata += "<" + self.temptagdata
            return "waitforendscript"
        # 2
        elif byte.isalnum() or byte in identChars:
            self.tagName += byte
            if not "script".startswith(self.tagName):
                self.bodydata += "<" + self.temptagdata
                return "waitforendscript"
            elif self.tagName == "script":
                self.gotText(self.bodydata)
                self.gotTagEnd(self.tagName)
                return "waitforgt"
        # 3
        elif byte.isspace():
            return "waitscriptendtag"
        # 4
        else:
            self.bodydata += "<" + self.temptagdata
            return "waitforendscript"

    def begin_entityref(self, byte):
        self.erefbuf = ""
        self.erefextra = ""  # extra bit for lenient mode

    def do_entityref(self, byte):
        if byte.isspace() or byte == "<":
            if self.beExtremelyLenient:
                # '&foo' probably was '&amp;foo'
                if self.erefbuf and self.erefbuf != "amp":
                    self.erefextra = self.erefbuf
                self.erefbuf = "amp"
                if byte == "<":
                    return "tagstart"
                else:
                    self.erefextra += byte
                    return "spacebodydata"
            self._parseError("Bad entity reference")
        elif byte != ";":
            self.erefbuf += byte
        else:
            return "bodydata"

    def end_entityref(self):
        self.gotEntityReference(self.erefbuf)

    # hacky support for space after & in entityref in beExtremelyLenient
    # state should only happen in that case
    def begin_spacebodydata(self, byte):
        self.bodydata = self.erefextra
        self.erefextra = None

    do_spacebodydata = do_bodydata
    end_spacebodydata = end_bodydata

    # Sorta SAX-ish API

    def gotTagStart(self, name, attributes):
        """Encountered an opening tag.

        Default behaviour is to print."""
        print("begin", name, attributes)

    def gotText(self, data):
        """Encountered text

        Default behaviour is to print."""
        print("text:", repr(data))

    def gotEntityReference(self, entityRef):
        """Encountered mnemonic entity reference

        Default behaviour is to print."""
        print("entityRef: &%s;" % entityRef)

    def gotComment(self, comment):
        """Encountered comment.

        Default behaviour is to ignore."""
        pass

    def gotCData(self, cdata):
        """Encountered CDATA

        Default behaviour is to call the gotText method"""
        self.gotText(cdata)

    def gotDoctype(self, doctype):
        """Encountered DOCTYPE

        This is really grotty: it basically just gives you everything between
        '<!DOCTYPE' and '>' as an argument.
        """
        print("!DOCTYPE", repr(doctype))

    def gotTagEnd(self, name):
        """Encountered closing tag

        Default behaviour is to print."""
        print("end", name)