// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
- package text
- import (
- "bytes"
- "fmt"
- "io"
- "strconv"
- "unicode/utf8"
- "google.golang.org/protobuf/internal/errors"
- )
- // Decoder is a token-based textproto decoder.
- type Decoder struct {
- // lastCall is last method called, either readCall or peekCall.
- // Initial value is readCall.
- lastCall call
- // lastToken contains the last read token.
- lastToken Token
- // lastErr contains the last read error.
- lastErr error
- // openStack is a stack containing the byte characters for MessageOpen and
- // ListOpen kinds. The top of stack represents the message or the list that
- // the current token is nested in. An empty stack means the current token is
- // at the top level message. The characters '{' and '<' both represent the
- // MessageOpen kind.
- openStack []byte
- // orig is used in reporting line and column.
- orig []byte
- // in contains the unconsumed input.
- in []byte
- }
- // NewDecoder returns a Decoder to read the given []byte.
- func NewDecoder(b []byte) *Decoder {
- return &Decoder{orig: b, in: b}
- }
- // ErrUnexpectedEOF means that EOF was encountered in the middle of the input.
- var ErrUnexpectedEOF = errors.New("%v", io.ErrUnexpectedEOF)
// call identifies which Decoder method was invoked last.
type call uint8

const (
	// readCall marks that Read was the last method called. It is the zero
	// value of call.
	readCall call = iota
	// peekCall marks that Peek was the last method called.
	peekCall
)
- // Peek looks ahead and returns the next token and error without advancing a read.
- func (d *Decoder) Peek() (Token, error) {
- defer func() { d.lastCall = peekCall }()
- if d.lastCall == readCall {
- d.lastToken, d.lastErr = d.Read()
- }
- return d.lastToken, d.lastErr
- }
- // Read returns the next token.
- // It will return an error if there is no valid token.
- func (d *Decoder) Read() (Token, error) {
- defer func() { d.lastCall = readCall }()
- if d.lastCall == peekCall {
- return d.lastToken, d.lastErr
- }
- tok, err := d.parseNext(d.lastToken.kind)
- if err != nil {
- return Token{}, err
- }
- switch tok.kind {
- case comma, semicolon:
- tok, err = d.parseNext(tok.kind)
- if err != nil {
- return Token{}, err
- }
- }
- d.lastToken = tok
- return tok, nil
- }
// Format strings for the syntax errors reported by parseNext.
const (
	// mismatchedFmt reports a closing character that does not pair with the
	// character that opened the current message.
	mismatchedFmt = "mismatched close character %q"
	// unexpectedFmt reports a character that is not allowed at the current
	// position.
	unexpectedFmt = "unexpected character %q"
)
- // parseNext parses the next Token based on given last kind.
- func (d *Decoder) parseNext(lastKind Kind) (Token, error) {
- // Trim leading spaces.
- d.consume(0)
- isEOF := false
- if len(d.in) == 0 {
- isEOF = true
- }
- switch lastKind {
- case EOF:
- return d.consumeToken(EOF, 0, 0), nil
- case bof:
- // Start of top level message. Next token can be EOF or Name.
- if isEOF {
- return d.consumeToken(EOF, 0, 0), nil
- }
- return d.parseFieldName()
- case Name:
- // Next token can be MessageOpen, ListOpen or Scalar.
- if isEOF {
- return Token{}, ErrUnexpectedEOF
- }
- switch ch := d.in[0]; ch {
- case '{', '<':
- d.pushOpenStack(ch)
- return d.consumeToken(MessageOpen, 1, 0), nil
- case '[':
- d.pushOpenStack(ch)
- return d.consumeToken(ListOpen, 1, 0), nil
- default:
- return d.parseScalar()
- }
- case Scalar:
- openKind, closeCh := d.currentOpenKind()
- switch openKind {
- case bof:
- // Top level message.
- // Next token can be EOF, comma, semicolon or Name.
- if isEOF {
- return d.consumeToken(EOF, 0, 0), nil
- }
- switch d.in[0] {
- case ',':
- return d.consumeToken(comma, 1, 0), nil
- case ';':
- return d.consumeToken(semicolon, 1, 0), nil
- default:
- return d.parseFieldName()
- }
- case MessageOpen:
- // Next token can be MessageClose, comma, semicolon or Name.
- if isEOF {
- return Token{}, ErrUnexpectedEOF
- }
- switch ch := d.in[0]; ch {
- case closeCh:
- d.popOpenStack()
- return d.consumeToken(MessageClose, 1, 0), nil
- case otherCloseChar[closeCh]:
- return Token{}, d.newSyntaxError(mismatchedFmt, ch)
- case ',':
- return d.consumeToken(comma, 1, 0), nil
- case ';':
- return d.consumeToken(semicolon, 1, 0), nil
- default:
- return d.parseFieldName()
- }
- case ListOpen:
- // Next token can be ListClose or comma.
- if isEOF {
- return Token{}, ErrUnexpectedEOF
- }
- switch ch := d.in[0]; ch {
- case ']':
- d.popOpenStack()
- return d.consumeToken(ListClose, 1, 0), nil
- case ',':
- return d.consumeToken(comma, 1, 0), nil
- default:
- return Token{}, d.newSyntaxError(unexpectedFmt, ch)
- }
- }
- case MessageOpen:
- // Next token can be MessageClose or Name.
- if isEOF {
- return Token{}, ErrUnexpectedEOF
- }
- _, closeCh := d.currentOpenKind()
- switch ch := d.in[0]; ch {
- case closeCh:
- d.popOpenStack()
- return d.consumeToken(MessageClose, 1, 0), nil
- case otherCloseChar[closeCh]:
- return Token{}, d.newSyntaxError(mismatchedFmt, ch)
- default:
- return d.parseFieldName()
- }
- case MessageClose:
- openKind, closeCh := d.currentOpenKind()
- switch openKind {
- case bof:
- // Top level message.
- // Next token can be EOF, comma, semicolon or Name.
- if isEOF {
- return d.consumeToken(EOF, 0, 0), nil
- }
- switch ch := d.in[0]; ch {
- case ',':
- return d.consumeToken(comma, 1, 0), nil
- case ';':
- return d.consumeToken(semicolon, 1, 0), nil
- default:
- return d.parseFieldName()
- }
- case MessageOpen:
- // Next token can be MessageClose, comma, semicolon or Name.
- if isEOF {
- return Token{}, ErrUnexpectedEOF
- }
- switch ch := d.in[0]; ch {
- case closeCh:
- d.popOpenStack()
- return d.consumeToken(MessageClose, 1, 0), nil
- case otherCloseChar[closeCh]:
- return Token{}, d.newSyntaxError(mismatchedFmt, ch)
- case ',':
- return d.consumeToken(comma, 1, 0), nil
- case ';':
- return d.consumeToken(semicolon, 1, 0), nil
- default:
- return d.parseFieldName()
- }
- case ListOpen:
- // Next token can be ListClose or comma
- if isEOF {
- return Token{}, ErrUnexpectedEOF
- }
- switch ch := d.in[0]; ch {
- case closeCh:
- d.popOpenStack()
- return d.consumeToken(ListClose, 1, 0), nil
- case ',':
- return d.consumeToken(comma, 1, 0), nil
- default:
- return Token{}, d.newSyntaxError(unexpectedFmt, ch)
- }
- }
- case ListOpen:
- // Next token can be ListClose, MessageStart or Scalar.
- if isEOF {
- return Token{}, ErrUnexpectedEOF
- }
- switch ch := d.in[0]; ch {
- case ']':
- d.popOpenStack()
- return d.consumeToken(ListClose, 1, 0), nil
- case '{', '<':
- d.pushOpenStack(ch)
- return d.consumeToken(MessageOpen, 1, 0), nil
- default:
- return d.parseScalar()
- }
- case ListClose:
- openKind, closeCh := d.currentOpenKind()
- switch openKind {
- case bof:
- // Top level message.
- // Next token can be EOF, comma, semicolon or Name.
- if isEOF {
- return d.consumeToken(EOF, 0, 0), nil
- }
- switch ch := d.in[0]; ch {
- case ',':
- return d.consumeToken(comma, 1, 0), nil
- case ';':
- return d.consumeToken(semicolon, 1, 0), nil
- default:
- return d.parseFieldName()
- }
- case MessageOpen:
- // Next token can be MessageClose, comma, semicolon or Name.
- if isEOF {
- return Token{}, ErrUnexpectedEOF
- }
- switch ch := d.in[0]; ch {
- case closeCh:
- d.popOpenStack()
- return d.consumeToken(MessageClose, 1, 0), nil
- case otherCloseChar[closeCh]:
- return Token{}, d.newSyntaxError(mismatchedFmt, ch)
- case ',':
- return d.consumeToken(comma, 1, 0), nil
- case ';':
- return d.consumeToken(semicolon, 1, 0), nil
- default:
- return d.parseFieldName()
- }
- default:
- // It is not possible to have this case. Let it panic below.
- }
- case comma, semicolon:
- openKind, closeCh := d.currentOpenKind()
- switch openKind {
- case bof:
- // Top level message. Next token can be EOF or Name.
- if isEOF {
- return d.consumeToken(EOF, 0, 0), nil
- }
- return d.parseFieldName()
- case MessageOpen:
- // Next token can be MessageClose or Name.
- if isEOF {
- return Token{}, ErrUnexpectedEOF
- }
- switch ch := d.in[0]; ch {
- case closeCh:
- d.popOpenStack()
- return d.consumeToken(MessageClose, 1, 0), nil
- case otherCloseChar[closeCh]:
- return Token{}, d.newSyntaxError(mismatchedFmt, ch)
- default:
- return d.parseFieldName()
- }
- case ListOpen:
- if lastKind == semicolon {
- // It is not be possible to have this case as logic here
- // should not have produced a semicolon Token when inside a
- // list. Let it panic below.
- break
- }
- // Next token can be MessageOpen or Scalar.
- if isEOF {
- return Token{}, ErrUnexpectedEOF
- }
- switch ch := d.in[0]; ch {
- case '{', '<':
- d.pushOpenStack(ch)
- return d.consumeToken(MessageOpen, 1, 0), nil
- default:
- return d.parseScalar()
- }
- }
- }
- line, column := d.Position(len(d.orig) - len(d.in))
- panic(fmt.Sprintf("Decoder.parseNext: bug at handling line %d:%d with lastKind=%v", line, column, lastKind))
- }
// otherCloseChar maps each message closing character to the other message
// closing character. It is used to detect a message that is closed with the
// character belonging to the other bracket style.
var otherCloseChar = map[byte]byte{
	'}': '>',
	'>': '}',
}
- // currentOpenKind indicates whether current position is inside a message, list
- // or top-level message by returning MessageOpen, ListOpen or bof respectively.
- // If the returned kind is either a MessageOpen or ListOpen, it also returns the
- // corresponding closing character.
- func (d *Decoder) currentOpenKind() (Kind, byte) {
- if len(d.openStack) == 0 {
- return bof, 0
- }
- openCh := d.openStack[len(d.openStack)-1]
- switch openCh {
- case '{':
- return MessageOpen, '}'
- case '<':
- return MessageOpen, '>'
- case '[':
- return ListOpen, ']'
- }
- panic(fmt.Sprintf("Decoder: openStack contains invalid byte %c", openCh))
- }
- func (d *Decoder) pushOpenStack(ch byte) {
- d.openStack = append(d.openStack, ch)
- }
- func (d *Decoder) popOpenStack() {
- d.openStack = d.openStack[:len(d.openStack)-1]
- }
- // parseFieldName parses field name and separator.
- func (d *Decoder) parseFieldName() (tok Token, err error) {
- defer func() {
- if err == nil && d.tryConsumeChar(':') {
- tok.attrs |= hasSeparator
- }
- }()
- // Extension or Any type URL.
- if d.in[0] == '[' {
- return d.parseTypeName()
- }
- // Identifier.
- if size := parseIdent(d.in, false); size > 0 {
- return d.consumeToken(Name, size, uint8(IdentName)), nil
- }
- // Field number. Identify if input is a valid number that is not negative
- // and is decimal integer within 32-bit range.
- if num := parseNumber(d.in); num.size > 0 {
- str := num.string(d.in)
- if !num.neg && num.kind == numDec {
- if _, err := strconv.ParseInt(str, 10, 32); err == nil {
- return d.consumeToken(Name, num.size, uint8(FieldNumber)), nil
- }
- }
- return Token{}, d.newSyntaxError("invalid field number: %s", str)
- }
- return Token{}, d.newSyntaxError("invalid field name: %s", errId(d.in))
- }
- // parseTypeName parses Any type URL or extension field name. The name is
- // enclosed in [ and ] characters. The C++ parser does not handle many legal URL
- // strings. This implementation is more liberal and allows for the pattern
- // ^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`). Whitespaces and comments are allowed
- // in between [ ], '.', '/' and the sub names.
- func (d *Decoder) parseTypeName() (Token, error) {
- startPos := len(d.orig) - len(d.in)
- // Use alias s to advance first in order to use d.in for error handling.
- // Caller already checks for [ as first character.
- s := consume(d.in[1:], 0)
- if len(s) == 0 {
- return Token{}, ErrUnexpectedEOF
- }
- var name []byte
- for len(s) > 0 && isTypeNameChar(s[0]) {
- name = append(name, s[0])
- s = s[1:]
- }
- s = consume(s, 0)
- var closed bool
- for len(s) > 0 && !closed {
- switch {
- case s[0] == ']':
- s = s[1:]
- closed = true
- case s[0] == '/', s[0] == '.':
- if len(name) > 0 && (name[len(name)-1] == '/' || name[len(name)-1] == '.') {
- return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s",
- d.orig[startPos:len(d.orig)-len(s)+1])
- }
- name = append(name, s[0])
- s = s[1:]
- s = consume(s, 0)
- for len(s) > 0 && isTypeNameChar(s[0]) {
- name = append(name, s[0])
- s = s[1:]
- }
- s = consume(s, 0)
- default:
- return Token{}, d.newSyntaxError(
- "invalid type URL/extension field name: %s", d.orig[startPos:len(d.orig)-len(s)+1])
- }
- }
- if !closed {
- return Token{}, ErrUnexpectedEOF
- }
- // First character cannot be '.'. Last character cannot be '.' or '/'.
- size := len(name)
- if size == 0 || name[0] == '.' || name[size-1] == '.' || name[size-1] == '/' {
- return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s",
- d.orig[startPos:len(d.orig)-len(s)])
- }
- d.in = s
- endPos := len(d.orig) - len(d.in)
- d.consume(0)
- return Token{
- kind: Name,
- attrs: uint8(TypeName),
- pos: startPos,
- raw: d.orig[startPos:endPos],
- str: string(name),
- }, nil
- }
// isTypeNameChar reports whether b may appear in a segment of a type name:
// an ASCII letter, an ASCII digit, '-' or '_'.
func isTypeNameChar(b byte) bool {
	switch {
	case b == '-', b == '_':
		return true
	case '0' <= b && b <= '9':
		return true
	case 'a' <= b && b <= 'z', 'A' <= b && b <= 'Z':
		return true
	}
	return false
}
// isWhiteSpace reports whether b is a space, newline, carriage return or tab.
func isWhiteSpace(b byte) bool {
	return b == ' ' || b == '\n' || b == '\r' || b == '\t'
}
- // parseIdent parses an unquoted proto identifier and returns size.
- // If allowNeg is true, it allows '-' to be the first character in the
- // identifier. This is used when parsing literal values like -infinity, etc.
- // Regular expression matches an identifier: `^[_a-zA-Z][_a-zA-Z0-9]*`
- func parseIdent(input []byte, allowNeg bool) int {
- var size int
- s := input
- if len(s) == 0 {
- return 0
- }
- if allowNeg && s[0] == '-' {
- s = s[1:]
- size++
- if len(s) == 0 {
- return 0
- }
- }
- switch {
- case s[0] == '_',
- 'a' <= s[0] && s[0] <= 'z',
- 'A' <= s[0] && s[0] <= 'Z':
- s = s[1:]
- size++
- default:
- return 0
- }
- for len(s) > 0 && (s[0] == '_' ||
- 'a' <= s[0] && s[0] <= 'z' ||
- 'A' <= s[0] && s[0] <= 'Z' ||
- '0' <= s[0] && s[0] <= '9') {
- s = s[1:]
- size++
- }
- if len(s) > 0 && !isDelim(s[0]) {
- return 0
- }
- return size
- }
- // parseScalar parses for a string, literal or number value.
- func (d *Decoder) parseScalar() (Token, error) {
- if d.in[0] == '"' || d.in[0] == '\'' {
- return d.parseStringValue()
- }
- if tok, ok := d.parseLiteralValue(); ok {
- return tok, nil
- }
- if tok, ok := d.parseNumberValue(); ok {
- return tok, nil
- }
- return Token{}, d.newSyntaxError("invalid scalar value: %s", errId(d.in))
- }
- // parseLiteralValue parses a literal value. A literal value is used for
- // bools, special floats and enums. This function simply identifies that the
- // field value is a literal.
- func (d *Decoder) parseLiteralValue() (Token, bool) {
- size := parseIdent(d.in, true)
- if size == 0 {
- return Token{}, false
- }
- return d.consumeToken(Scalar, size, literalValue), true
- }
- // consumeToken constructs a Token for given Kind from d.in and consumes given
- // size-length from it.
- func (d *Decoder) consumeToken(kind Kind, size int, attrs uint8) Token {
- // Important to compute raw and pos before consuming.
- tok := Token{
- kind: kind,
- attrs: attrs,
- pos: len(d.orig) - len(d.in),
- raw: d.in[:size],
- }
- d.consume(size)
- return tok
- }
- // newSyntaxError returns a syntax error with line and column information for
- // current position.
- func (d *Decoder) newSyntaxError(f string, x ...interface{}) error {
- e := errors.New(f, x...)
- line, column := d.Position(len(d.orig) - len(d.in))
- return errors.New("syntax error (line %d:%d): %v", line, column, e)
- }
- // Position returns line and column number of given index of the original input.
- // It will panic if index is out of range.
- func (d *Decoder) Position(idx int) (line int, column int) {
- b := d.orig[:idx]
- line = bytes.Count(b, []byte("\n")) + 1
- if i := bytes.LastIndexByte(b, '\n'); i >= 0 {
- b = b[i+1:]
- }
- column = utf8.RuneCount(b) + 1 // ignore multi-rune characters
- return line, column
- }
- func (d *Decoder) tryConsumeChar(c byte) bool {
- if len(d.in) > 0 && d.in[0] == c {
- d.consume(1)
- return true
- }
- return false
- }
- // consume consumes n bytes of input and any subsequent whitespace or comments.
- func (d *Decoder) consume(n int) {
- d.in = consume(d.in, n)
- return
- }
// consume returns b with its first n bytes removed, followed by the removal
// of any whitespace (space, newline, carriage return, tab) and '#' comments.
// A comment runs through the next newline; if a comment is not terminated by
// a newline, the rest of the input is discarded and nil is returned.
func consume(b []byte, n int) []byte {
	rest := b[n:]
	for len(rest) > 0 {
		c := rest[0]
		if c == '#' {
			end := bytes.IndexByte(rest, '\n')
			if end < 0 {
				return nil
			}
			rest = rest[end+1:]
			continue
		}
		if c != ' ' && c != '\n' && c != '\r' && c != '\t' {
			break
		}
		rest = rest[1:]
	}
	return rest
}
- // errId extracts a byte sequence that looks like an invalid ID
- // (for the purposes of error reporting).
- func errId(seq []byte) []byte {
- const maxLen = 32
- for i := 0; i < len(seq); {
- if i > maxLen {
- return append(seq[:i:i], "…"...)
- }
- r, size := utf8.DecodeRune(seq[i:])
- if r > utf8.RuneSelf || (r != '/' && isDelim(byte(r))) {
- if i == 0 {
- // Either the first byte is invalid UTF-8 or a
- // delimiter, or the first rune is non-ASCII.
- // Return it as-is.
- i = size
- }
- return seq[:i:i]
- }
- i += size
- }
- // No delimiter found.
- return seq
- }
// isDelim reports whether c is a delimiter character, that is, anything other
// than an ASCII letter, an ASCII digit, '-', '+', '.' or '_'.
func isDelim(c byte) bool {
	switch {
	case c == '-', c == '+', c == '.', c == '_':
		return false
	case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z', '0' <= c && c <= '9':
		return false
	}
	return true
}
|