/*
 * Copyright (c) 2007-2014, Lloyd Hilaiel
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include "yajl_lex.h"
#include "yajl_buf.h"

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <string.h>

#ifdef YAJL_LEXER_DEBUG
/* debug helper: map a token to a short printable name */
static const char *
tokToStr(yajl_tok tok)
{
    switch (tok) {
        case yajl_tok_bool: return "bool";
        case yajl_tok_colon: return "colon";
        case yajl_tok_comma: return "comma";
        case yajl_tok_eof: return "eof";
        case yajl_tok_error: return "error";
        case yajl_tok_left_brace: return "brace";
        case yajl_tok_left_bracket: return "bracket";
        case yajl_tok_null: return "null";
        case yajl_tok_inf: return "infinity";
        case yajl_tok_minus_inf: return "-infinity";
        case yajl_tok_integer: return "integer";
        case yajl_tok_double: return "double";
        case yajl_tok_right_brace: return "brace";
        case yajl_tok_right_bracket: return "bracket";
        case yajl_tok_string: return "string";
        case yajl_tok_string_with_escapes: return "string_with_escapes";
    }
    return "unknown";
}
#endif

/* Impact of the stream parsing feature on the lexer:
 *
 * YAJL supports stream parsing.  That is, the ability to parse the first
 * bits of a chunk of JSON before the last bits are available (still on
 * the network or disk).  This makes the lexer more complex.  The
 * responsibility of the lexer is to handle transparently the case where
 * a chunk boundary falls in the middle of a token.  This is
 * accomplished via a buffer and a character reading abstraction.
 *
 * Overview of implementation
 *
 * When we lex to end of input string before end of token is hit, we
 * copy all of the input text composing the token into our lexBuf.
 *
 * Every time we read a character, we do so through the readChar function.
 * readChar's responsibility is to handle pulling all chars from the buffer
 * before pulling chars from input text
 */

struct yajl_lexer_t {
    /* the overall line and char offset into the data
     * NOTE(review): no code visible in this file updates these after the
     * zeroing in yajl_lex_alloc -- confirm whether they are maintained
     * elsewhere */
    size_t lineOff;
    size_t charOff;

    /* error */
    yajl_lex_error error;

    /* an input buffer to handle the case where a token is spread over
     * multiple chunks */
    yajl_buf buf;

    /* in the case where we have data in the lexBuf, bufOff holds
     * the current offset into the lexBuf. */
    size_t bufOff;

    /* are we using the lex buf? */
    unsigned int bufInUse;

    /* shall we allow comments? */
    unsigned int allowComments;

    /* shall we validate utf8 inside strings? */
    unsigned int validateUTF8;

    yajl_alloc_funcs * alloc;
};

/* read the next character: drain the lex buffer first (when it is in use
 * and not yet exhausted), otherwise take txt[*off] and advance *off.
 * The caller is responsible for checking *off against the input length
 * before invoking this macro. */
#define readChar(lxr, txt, off)                                         \
    (((lxr)->bufInUse && yajl_buf_len((lxr)->buf) &&                    \
      (lxr)->bufOff < yajl_buf_len((lxr)->buf)) ?                       \
     (*((const unsigned char *) yajl_buf_data((lxr)->buf) +             \
        ((lxr)->bufOff)++)) :                                           \
     ((txt)[(*(off))++]))

/* step back one character: rewind the input offset when it is non-zero,
 * otherwise rewind the lex buffer offset */
#define unreadChar(lxr, off) ((*(off) > 0) ? (*(off))-- : ((lxr)->bufOff--))

/**
 * Allocate and initialise a lexer.
 *
 * alloc          allocation routines used for the lexer and its buffer
 * allowComments  non-zero to accept comments in the input
 * validateUTF8   non-zero to validate utf8 inside strings
 *
 * returns the new lexer, or NULL when the lexer structure itself could
 * not be allocated.
 */
yajl_lexer
yajl_lex_alloc(yajl_alloc_funcs * alloc,
               unsigned int allowComments, unsigned int validateUTF8)
{
    yajl_lexer lxr = (yajl_lexer) YA_MALLOC(alloc, sizeof(struct yajl_lexer_t));
    /* passing a null pointer to memset is undefined behaviour, so bail
     * out on allocation failure instead of writing through it */
    if (lxr == NULL) return NULL;
    memset((void *) lxr, 0, sizeof(struct yajl_lexer_t));
    lxr->buf = yajl_buf_alloc(alloc);
    lxr->allowComments = allowComments;
    lxr->validateUTF8 = validateUTF8;
    lxr->alloc = alloc;
    return lxr;
}

/** Release a lexer and its internal buffer.  A NULL lexer is ignored. */
void
yajl_lex_free(yajl_lexer lxr)
{
    if (lxr == NULL) return;
    yajl_buf_free(lxr->buf);
    YA_FREE(lxr->alloc, lxr);
}

/* a lookup table which lets us quickly determine five things:
 * VEC - valid escaped control char
 * note.  the solidus '/' may be escaped or not.
 * IJC - invalid json char
 * VHC - valid hex char
 * NFP - needs further processing (from a string scanning perspective)
 * NUC - needs utf8 checking when enabled (from a string scanning perspective)
 */
#define VEC 0x01
#define IJC 0x02
#define VHC 0x04
#define NFP 0x08
#define NUC 0x10

static const char charLookupTable[256] =
{
/*00*/ IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    ,
/*08*/ IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    ,
/*10*/ IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    ,
/*18*/ IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    ,

/*20*/ 0      , 0      , NFP|VEC|IJC, 0      , 0      , 0      , 0      , 0      ,
/*28*/ 0      , 0      , 0      , 0      , 0      , 0      , 0      , VEC    ,
/*30*/ VHC    , VHC    , VHC    , VHC    , VHC    , VHC    , VHC    , VHC    ,
/*38*/ VHC    , VHC    , 0      , 0      , 0      , 0      , 0      , 0      ,

/*40*/ 0      , VHC    , VHC    , VHC    , VHC    , VHC    , VHC    , 0      ,
/*48*/ 0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
/*50*/ 0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
/*58*/ 0      , 0      , 0      , 0      , NFP|VEC|IJC, 0      , 0      , 0      ,

/*60*/ 0      , VHC    , VEC|VHC, VHC    , VHC    , VHC    , VEC|VHC, 0      ,
/*68*/ 0      , 0      , 0      , 0      , 0      , 0      , VEC    , 0      ,
/*70*/ 0      , 0      , VEC    , 0      , VEC    , 0      , 0      , 0      ,
/*78*/ 0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,

       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,

       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,

       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,

       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC
};

/** process a variable length utf8 encoded codepoint.
 *
 *  returns:
 *    yajl_tok_string - if valid utf8 char was parsed and offset was
 *                      advanced
 *    yajl_tok_eof - if end of input was hit before validation could
 *                   complete
 *    yajl_tok_error - if invalid utf8 was encountered
 *
 *  NOTE: on error the offset will point to the first char of the
 *  invalid utf8 */
#define UTF8_CHECK_EOF if (*offset >= jsonTextLen) { return yajl_tok_eof; }

static yajl_tok
yajl_lex_utf8_char(yajl_lexer lexer, const unsigned char * jsonText,
                   size_t jsonTextLen, size_t * offset,
                   unsigned char curChar)
{
    if (curChar <= 0x7f) {
        /* single byte */
        return yajl_tok_string;
    } else if ((curChar >> 5) == 0x6) {
        /* two byte */
        UTF8_CHECK_EOF;
        curChar = readChar(lexer, jsonText, offset);
        if ((curChar >> 6) == 0x2) return yajl_tok_string;
    } else if ((curChar >> 4) == 0x0e) {
        /* three byte */
        UTF8_CHECK_EOF;
        curChar = readChar(lexer, jsonText, offset);
        if ((curChar >> 6) == 0x2) {
            UTF8_CHECK_EOF;
            curChar = readChar(lexer, jsonText, offset);
            if ((curChar >> 6) == 0x2) return yajl_tok_string;
        }
    } else if ((curChar >> 3) == 0x1e) {
        /* four byte */
        UTF8_CHECK_EOF;
        curChar = readChar(lexer, jsonText, offset);
        if ((curChar >> 6) == 0x2) {
            UTF8_CHECK_EOF;
            curChar = readChar(lexer, jsonText, offset);
            if ((curChar >> 6) == 0x2) {
                UTF8_CHECK_EOF;
                curChar = readChar(lexer, jsonText, offset);
                if ((curChar >> 6) == 0x2) return yajl_tok_string;
            }
        }
    }

    return yajl_tok_error;
}

/* lex a string.  input is the lexer, pointer to beginning of
 * json text, and start of string (offset).
 * a token is returned which has the following meanings:
 * yajl_tok_string: lex of string was successful.  offset points to
 *                  terminating '"'.
 * yajl_tok_eof: end of text was encountered before we could complete
 *               the lex.
 * yajl_tok_error: embedded in the string were unallowable chars.  offset
 *               points to the offending char
 */
#define STR_CHECK_EOF \
if (*offset >= jsonTextLen) { \
   tok = yajl_tok_eof; \
   goto finish_string_lex; \
}

/** scan a string for interesting characters that might need further
 *  review.  return the number of chars that are uninteresting and can
 *  be skipped.
 * (lth) hi world, any thoughts on how to make this routine faster? */
static size_t
yajl_string_scan(const unsigned char * buf, size_t len, int utf8check)
{
    unsigned char mask = IJC|NFP|(utf8check ? NUC : 0);
    size_t skip = 0;
    while (skip < len && !(charLookupTable[*buf] & mask))
    {
        skip++;
        buf++;
    }
    return skip;
}

static yajl_tok
yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
                size_t jsonTextLen, size_t * offset)
{
    yajl_tok tok = yajl_tok_error;
    int hasEscapes = 0;

    for (;;) {
        unsigned char curChar;

        /* now jump into a faster scanning routine to skip as much
         * of the buffers as possible */
        {
            const unsigned char * p;
            size_t len;

            if ((lexer->bufInUse && yajl_buf_len(lexer->buf) &&
                 lexer->bufOff < yajl_buf_len(lexer->buf)))
            {
                p = ((const unsigned char *) yajl_buf_data(lexer->buf) +
                     (lexer->bufOff));
                len = yajl_buf_len(lexer->buf) - lexer->bufOff;
                lexer->bufOff += yajl_string_scan(p, len, lexer->validateUTF8);
            }
            else if (*offset < jsonTextLen)
            {
                p = jsonText + *offset;
                len = jsonTextLen - *offset;
                *offset += yajl_string_scan(p, len, lexer->validateUTF8);
            }
        }

        STR_CHECK_EOF;

        curChar = readChar(lexer, jsonText, offset);

        /* quote terminates */
        if (curChar == '"') {
            tok = yajl_tok_string;
            break;
        }
        /* backslash escapes a set of control chars, */
        else if (curChar == '\\') {
            hasEscapes = 1;
            STR_CHECK_EOF;

            /* special case \u */
            curChar = readChar(lexer, jsonText, offset);
            if (curChar == 'u') {
                unsigned int i = 0;

                for (i=0;i<4;i++) {
                    STR_CHECK_EOF;
                    curChar = readChar(lexer, jsonText, offset);
                    if (!(charLookupTable[curChar] & VHC)) {
                        /* back up to offending char */
                        unreadChar(lexer, offset);
                        lexer->error = yajl_lex_string_invalid_hex_char;
                        goto finish_string_lex;
                    }
                }
            } else if (!(charLookupTable[curChar] & VEC)) {
                /* back up to offending char */
                unreadChar(lexer, offset);
                lexer->error = yajl_lex_string_invalid_escaped_char;
                goto finish_string_lex;
            }
        }
        /* when not validating UTF8 it's a simple table lookup to determine
         * if the present character is invalid */
        else if(charLookupTable[curChar] & IJC) {
            /* back up to offending char */
            unreadChar(lexer, offset);
            lexer->error = yajl_lex_string_invalid_json_char;
            goto finish_string_lex;
        }
        /* when in validate UTF8 mode we need to do some extra work */
        else if (lexer->validateUTF8) {
            yajl_tok t = yajl_lex_utf8_char(lexer, jsonText, jsonTextLen,
                                            offset, curChar);

            if (t == yajl_tok_eof) {
                tok = yajl_tok_eof;
                goto finish_string_lex;
            } else if (t == yajl_tok_error) {
                lexer->error = yajl_lex_string_invalid_utf8;
                goto finish_string_lex;
            }
        }
        /* accept it, and move on */
    }
  finish_string_lex:
    /* tell our buddy, the parser, whether he needs to process this string
     * again */
    if (hasEscapes && tok == yajl_tok_string) {
        tok = yajl_tok_string_with_escapes;
    }

    return tok;
}

#define RETURN_IF_EOF if (*offset >= jsonTextLen) return yajl_tok_eof;

/**
 * lex a number, or the literal "inf" / "-inf".
 *
 * returns:
 *   yajl_tok_integer / yajl_tok_double - a number was lexed
 *   yajl_tok_inf / yajl_tok_minus_inf  - "inf" / "-inf" was lexed
 *   yajl_tok_eof   - end of input was hit before the token was complete
 *   yajl_tok_error - malformed number; lexer->error holds the reason
 */
static yajl_tok
yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
                size_t jsonTextLen, size_t * offset)
{
    /** XXX: numbers are the only entities in json that we must lex
     *       _beyond_ in order to know that they are complete.  There
     *       is an ambiguous case for integers at EOF. */

    unsigned char c;
    char minus = 0;

    yajl_tok tok = yajl_tok_integer;

    RETURN_IF_EOF;
    c = readChar(lexer, jsonText, offset);

    /* optional leading minus */
    if (c == '-') {
        minus = 1;
        RETURN_IF_EOF;
        c = readChar(lexer, jsonText, offset);
    }

    /* a single zero, or a series of integers */
    if (c == '0') {
        RETURN_IF_EOF;
        c = readChar(lexer, jsonText, offset);
    } else if (c >= '1' && c <= '9') {
        do {
            RETURN_IF_EOF;
            c = readChar(lexer, jsonText, offset);
        } while (c >= '0' && c <= '9');
    } else if (c == 'i') {
        /* the remaining letters of "inf" may lie beyond the end of the
         * current chunk: check for end of input before every read so we
         * never index past jsonText + jsonTextLen */
        RETURN_IF_EOF;
        if (readChar(lexer, jsonText, offset) != 'n') {
            unreadChar(lexer, offset);
            lexer->error = yajl_lex_invalid_infinity;
            return yajl_tok_error;
        }
        RETURN_IF_EOF;
        if (readChar(lexer, jsonText, offset) != 'f') {
            unreadChar(lexer, offset);
            lexer->error = yajl_lex_invalid_infinity;
            return yajl_tok_error;
        }
        if (minus) {
            return yajl_tok_minus_inf;
        } else {
            return yajl_tok_inf;
        }
    } else {
        unreadChar(lexer, offset);
        lexer->error = yajl_lex_missing_integer_after_minus;
        return yajl_tok_error;
    }

    /* optional fraction (indicates this is floating point) */
    if (c == '.') {
        int numRd = 0;

        RETURN_IF_EOF;
        c = readChar(lexer, jsonText, offset);

        while (c >= '0' && c <= '9') {
            numRd++;
            RETURN_IF_EOF;
            c = readChar(lexer, jsonText, offset);
        }

        if (!numRd) {
            unreadChar(lexer, offset);
            lexer->error = yajl_lex_missing_integer_after_decimal;
            return yajl_tok_error;
        }
        tok = yajl_tok_double;
    }

    /* optional exponent (indicates this is floating point) */
    if (c == 'e' || c == 'E') {
        RETURN_IF_EOF;
        c = readChar(lexer, jsonText, offset);

        /* optional sign */
        if (c == '+' || c == '-') {
            RETURN_IF_EOF;
            c = readChar(lexer, jsonText, offset);
        }

        if (c >= '0' && c <= '9') {
            do {
                RETURN_IF_EOF;
                c = readChar(lexer, jsonText, offset);
            } while (c >= '0' && c <= '9');
        } else {
            unreadChar(lexer, offset);
            lexer->error = yajl_lex_missing_integer_after_exponent;
            return yajl_tok_error;
        }
        tok = yajl_tok_double;
    }

    /* we always go "one too far" */
    unreadChar(lexer, offset);

    return tok;
}

/**
 * lex a comment; the leading '/' has already been consumed by the caller.
 *
 * returns:
 *   yajl_tok_comment - a complete line or block comment was skipped
 *   yajl_tok_eof     - end of input was hit inside the comment
 *   yajl_tok_error   - the '/' was not followed by '/' or '*'
 */
static yajl_tok
yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText,
                 size_t jsonTextLen, size_t * offset)
{
    unsigned char c;

    yajl_tok tok = yajl_tok_comment;

    RETURN_IF_EOF;
    c = readChar(lexer, jsonText, offset);

    /* either slash or star expected */
    if (c == '/') {
        /* now we throw away until end of line */
        do {
            RETURN_IF_EOF;
            c = readChar(lexer, jsonText, offset);
        } while (c != '\n');
    } else if (c == '*') {
        /* now we throw away until end of comment */
        for (;;) {
            RETURN_IF_EOF;
            c = readChar(lexer, jsonText, offset);
            if (c == '*') {
                RETURN_IF_EOF;
                c = readChar(lexer, jsonText, offset);
                if (c == '/') {
                    break;
                } else {
                    /* not the end of the comment; re-examine this char,
                     * it may itself be a '*' */
                    unreadChar(lexer, offset);
                }
            }
        }
    } else {
        lexer->error = yajl_lex_invalid_char;
        tok = yajl_tok_error;
    }

    return tok;
}

/* match the remainder of a keyword (its first character has already been
 * consumed) and jump to `lexed` with the resulting token.  Must be used
 * inside its own brace-enclosed scope because it declares a local. */
#define MATCH(want_value, target_token)                 \
    const char * want = want_value;                     \
    do {                                                \
        if (*offset >= jsonTextLen) {                   \
            tok = yajl_tok_eof;                         \
            goto lexed;                                 \
        }                                               \
        c = readChar(lexer, jsonText, offset);          \
        if (c != *want) {                               \
            unreadChar(lexer, offset);                  \
            lexer->error = yajl_lex_invalid_string;     \
            tok = yajl_tok_error;                       \
            goto lexed;                                 \
        }                                               \
    } while (*(++want));                                \
    tok = target_token;                                 \
    goto lexed;

/**
 * lex the next token out of jsonText, starting at *offset.
 *
 * On return *offset has been advanced past the consumed input.  For
 * tokens other than eof and error, *outBuf / *outLen describe the token
 * text (pointing either into jsonText or into the lexer's internal
 * buffer); for strings the surrounding quotes are excluded.
 *
 * When end of input is reached in the middle of a token, yajl_tok_eof is
 * returned and the partial token text is saved in the lexer's buffer so
 * that lexing can resume with the next chunk.
 *
 * On yajl_tok_error, lexer->error holds the specific reason.
 */
yajl_tok
yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
             size_t jsonTextLen, size_t * offset,
             const unsigned char ** outBuf, size_t * outLen)
{
    yajl_tok tok = yajl_tok_error;
    unsigned char c;
    size_t startOffset = *offset;

    *outBuf = NULL;
    *outLen = 0;

    for (;;) {
        assert(*offset <= jsonTextLen);

        if (*offset >= jsonTextLen) {
            tok = yajl_tok_eof;
            goto lexed;
        }

        c = readChar(lexer, jsonText, offset);

        switch (c) {
            case '{':
                tok = yajl_tok_left_bracket;
                goto lexed;
            case '}':
                tok = yajl_tok_right_bracket;
                goto lexed;
            case '[':
                tok = yajl_tok_left_brace;
                goto lexed;
            case ']':
                tok = yajl_tok_right_brace;
                goto lexed;
            case ',':
                tok = yajl_tok_comma;
                goto lexed;
            case ':':
                tok = yajl_tok_colon;
                goto lexed;
            case '\t': case '\n': case '\v': case '\f': case '\r': case ' ':
                startOffset++;
                break;
            case 't': {
                MATCH("rue", yajl_tok_bool);
            }
            case 'f': {
                MATCH("alse", yajl_tok_bool);
            }
            case 'n': {
                MATCH("ull", yajl_tok_null);
            }
            case '"': {
                tok = yajl_lex_string(lexer, (const unsigned char *) jsonText,
                                      jsonTextLen, offset);
                goto lexed;
            }
            case '-': case 'i':
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7': case '8': case '9': {
                /* integer parsing wants to start from the beginning */
                unreadChar(lexer, offset);
                tok = yajl_lex_number(lexer, (const unsigned char *) jsonText,
                                      jsonTextLen, offset);
                goto lexed;
            }
            case '/':
                /* hey, look, a probable comment!  If comments are disabled
                 * it's an error. */
                if (!lexer->allowComments) {
                    unreadChar(lexer, offset);
                    lexer->error = yajl_lex_unallowed_comment;
                    tok = yajl_tok_error;
                    goto lexed;
                }
                /* if comments are enabled, then we should try to lex
                 * the thing.  possible outcomes are
                 * - successful lex (tok_comment, which means continue),
                 * - malformed comment opening (slash not followed by
                 *   '*' or '/') (tok_error)
                 * - eof hit. (tok_eof) */
                tok = yajl_lex_comment(lexer, (const unsigned char *) jsonText,
                                       jsonTextLen, offset);
                if (tok == yajl_tok_comment) {
                    /* "error" is silly, but that's the initial
                     * state of tok.  guilty until proven innocent. */
                    tok = yajl_tok_error;
                    yajl_buf_clear(lexer->buf);
                    lexer->bufInUse = 0;
                    startOffset = *offset;
                    break;
                }
                /* hit error or eof, bail */
                goto lexed;
            default:
                lexer->error = yajl_lex_invalid_char;
                tok = yajl_tok_error;
                goto lexed;
        }
    }

  lexed:
    /* need to append to buffer if the buffer is in use or
     * if it's an EOF token */
    if (tok == yajl_tok_eof || lexer->bufInUse) {
        if (!lexer->bufInUse) yajl_buf_clear(lexer->buf);
        lexer->bufInUse = 1;
        yajl_buf_append(lexer->buf, jsonText + startOffset, *offset - startOffset);
        lexer->bufOff = 0;

        if (tok != yajl_tok_eof) {
            *outBuf = yajl_buf_data(lexer->buf);
            *outLen = yajl_buf_len(lexer->buf);
            lexer->bufInUse = 0;
        }
    } else if (tok != yajl_tok_error) {
        *outBuf = jsonText + startOffset;
        *outLen = *offset - startOffset;
    }

    /* special case for strings. skip the quotes. */
    if (tok == yajl_tok_string || tok == yajl_tok_string_with_escapes)
    {
        assert(*outLen >= 2);
        (*outBuf)++;
        *outLen -= 2;
    }

#ifdef YAJL_LEXER_DEBUG
    if (tok == yajl_tok_error) {
        printf("lexical error: %s\n",
               yajl_lex_error_to_string(yajl_lex_get_error(lexer)));
    } else if (tok == yajl_tok_eof) {
        printf("EOF hit\n");
    } else {
        printf("lexed %s: '", tokToStr(tok));
        fwrite(*outBuf, 1, *outLen, stdout);
        printf("'\n");
    }
#endif

    return tok;
}

/** map a lexical error code to a human readable, statically allocated
 *  description */
const char *
yajl_lex_error_to_string(yajl_lex_error error)
{
    switch (error) {
        case yajl_lex_e_ok:
            return "ok, no error";
        case yajl_lex_string_invalid_utf8:
            return "invalid bytes in UTF8 string.";
        case yajl_lex_string_invalid_escaped_char:
            return "inside a string, '\\' occurs before a character "
                   "which it may not.";
        case yajl_lex_string_invalid_json_char:
            return "invalid character inside string.";
        case yajl_lex_string_invalid_hex_char:
            return "invalid (non-hex) character occurs after '\\u' inside "
                   "string.";
        case yajl_lex_invalid_char:
            return "invalid char in json text.";
        case yajl_lex_invalid_string:
            return "invalid string in json text.";
        case yajl_lex_missing_integer_after_exponent:
            return "malformed number, a digit is required after the exponent.";
        case yajl_lex_missing_integer_after_decimal:
            return "malformed number, a digit is required after the "
                   "decimal point.";
        case yajl_lex_missing_integer_after_minus:
            return "malformed number, a digit is required after the "
                   "minus sign.";
        case yajl_lex_invalid_infinity:
            return "malformed number, a token inf required for number starting "
                   "from 'i'";
        case yajl_lex_unallowed_comment:
            return "probable comment found in input text, comments are "
                   "not enabled.";
    }
    return "unknown error code";
}


/** allows access to more specific information about the lexical
 *  error when yajl_lex_lex returns yajl_tok_error. */
yajl_lex_error
yajl_lex_get_error(yajl_lexer lexer)
{
    if (lexer == NULL) return (yajl_lex_error) -1;
    return lexer->error;
}

/** returns the lexer's stored line offset */
size_t yajl_lex_current_line(yajl_lexer lexer)
{
    return lexer->lineOff;
}

/** returns the lexer's stored char offset */
size_t yajl_lex_current_char(yajl_lexer lexer)
{
    return lexer->charOff;
}

/** have a peek at the next token without consuming it: the token is
 *  lexed on a by-value copy of the offset, and the lexer's buffer length,
 *  buffer offset and in-use flag are restored afterwards */
yajl_tok yajl_lex_peek(yajl_lexer lexer, const unsigned char * jsonText,
                       size_t jsonTextLen, size_t offset)
{
    const unsigned char * outBuf;
    size_t outLen;
    size_t bufLen = yajl_buf_len(lexer->buf);
    size_t bufOff = lexer->bufOff;
    unsigned int bufInUse = lexer->bufInUse;
    yajl_tok tok;

    tok = yajl_lex_lex(lexer, jsonText, jsonTextLen, &offset,
                       &outBuf, &outLen);

    lexer->bufOff = bufOff;
    lexer->bufInUse = bufInUse;
    yajl_buf_truncate(lexer->buf, bufLen);

    return tok;
}

/** returns the capacity of the lexer's internal buffer, as reported by
 *  yajl_buf_capacity */
size_t yajl_lex_buf_capacity(yajl_lexer lexer)
{
    return yajl_buf_capacity(lexer->buf);
}