yajl_lex.c 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766
  /*
   * Copyright (c) 2007-2014, Lloyd Hilaiel <me@lloyd.io>
   *
   * Permission to use, copy, modify, and/or distribute this software for any
   * purpose with or without fee is hereby granted, provided that the above
   * copyright notice and this permission notice appear in all copies.
   *
   * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
   * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
   * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
   * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
   * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
   * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
   * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
   */
  16. #include "yajl_lex.h"
  17. #include "yajl_buf.h"
  18. #include <stdlib.h>
  19. #include <stdio.h>
  20. #include <assert.h>
  21. #include <string.h>
  #ifdef YAJL_LEXER_DEBUG
  /* Translate a token into a short printable label for the debug trace
   * printed by yajl_lex_lex.  The opening and closing form of each
   * delimiter share one label ("brace" / "bracket"); a token that is not
   * listed below yields "unknown". */
  static const char *
  tokToStr(yajl_tok tok)
  {
      const char * label = "unknown";

      switch (tok) {
          case yajl_tok_left_brace:
          case yajl_tok_right_brace:
              label = "brace";
              break;
          case yajl_tok_left_bracket:
          case yajl_tok_right_bracket:
              label = "bracket";
              break;
          case yajl_tok_string:              label = "string";              break;
          case yajl_tok_string_with_escapes: label = "string_with_escapes"; break;
          case yajl_tok_integer:             label = "integer";             break;
          case yajl_tok_double:              label = "double";              break;
          case yajl_tok_inf:                 label = "infinity";            break;
          case yajl_tok_minus_inf:           label = "-infinity";           break;
          case yajl_tok_bool:                label = "bool";                break;
          case yajl_tok_null:                label = "null";                break;
          case yajl_tok_colon:               label = "colon";               break;
          case yajl_tok_comma:               label = "comma";               break;
          case yajl_tok_eof:                 label = "eof";                 break;
          case yajl_tok_error:               label = "error";               break;
          default:
              break;
      }
      return label;
  }
  #endif
  /* Impact of the stream parsing feature on the lexer:
   *
   * YAJL supports stream parsing, that is, the ability to parse the first
   * bits of a chunk of JSON before the last bits are available (still on
   * the network or disk). This makes the lexer more complex. The
   * responsibility of the lexer is to transparently handle the case where
   * a chunk boundary falls in the middle of a token. This is
   * accomplished via a buffer and a character reading abstraction.
   *
   * Overview of implementation
   *
   * When we lex to the end of the input string before the end of a token
   * is hit, we copy all of the input text composing the token into our
   * lexBuf.
   *
   * Every time we read a character, we do so through the readChar macro.
   * readChar's responsibility is to pull all chars from the buffer
   * before pulling chars from the input text.
   */
  65. struct yajl_lexer_t {
  66. /* the overal line and char offset into the data */
  67. size_t lineOff;
  68. size_t charOff;
  69. /* error */
  70. yajl_lex_error error;
  71. /* a input buffer to handle the case where a token is spread over
  72. * multiple chunks */
  73. yajl_buf buf;
  74. /* in the case where we have data in the lexBuf, bufOff holds
  75. * the current offset into the lexBuf. */
  76. size_t bufOff;
  77. /* are we using the lex buf? */
  78. unsigned int bufInUse;
  79. /* shall we allow comments? */
  80. unsigned int allowComments;
  81. /* shall we validate utf8 inside strings? */
  82. unsigned int validateUTF8;
  83. yajl_alloc_funcs * alloc;
  84. };
  /* readChar: fetch the next input byte and advance the matching cursor.
   *
   * While the lexer's buffer is in use and still has unread bytes, the byte
   * comes from that buffer and bufOff advances; otherwise it comes from txt
   * and *off advances.  No bounds check is made on txt, so the caller must
   * first make sure input remains.
   *
   * Every use of a macro parameter is parenthesized so that any expression
   * (for example "&lexerObject") can be passed.  Note that lxr and off are
   * still evaluated more than once: pass expressions without side effects. */
  #define readChar(lxr, txt, off)                                        \
      (((lxr)->bufInUse && yajl_buf_len((lxr)->buf) &&                   \
        (lxr)->bufOff < yajl_buf_len((lxr)->buf)) ?                      \
       (*((const unsigned char *) yajl_buf_data((lxr)->buf) +            \
          ((lxr)->bufOff)++)) :                                          \
       ((txt)[(*(off))++]))

  /* unreadChar: step back one byte.  A non-zero *off is decremented;
   * when *off is zero the buffer cursor bufOff is decremented instead. */
  #define unreadChar(lxr, off) \
      ((*(off) > 0) ? (*(off))-- : ((lxr)->bufOff--))
  90. yajl_lexer
  91. yajl_lex_alloc(yajl_alloc_funcs * alloc,
  92. unsigned int allowComments,
  93. unsigned int validateUTF8)
  94. {
  95. yajl_lexer lxr = (yajl_lexer) YA_MALLOC(alloc, sizeof(struct yajl_lexer_t));
  96. memset((void *) lxr, 0, sizeof(struct yajl_lexer_t));
  97. lxr->buf = yajl_buf_alloc(alloc);
  98. lxr->allowComments = allowComments;
  99. lxr->validateUTF8 = validateUTF8;
  100. lxr->alloc = alloc;
  101. return lxr;
  102. }
  103. void
  104. yajl_lex_free(yajl_lexer lxr)
  105. {
  106. yajl_buf_free(lxr->buf);
  107. YA_FREE(lxr->alloc, lxr);
  108. return;
  109. }
  /* A lookup table, indexed by byte value, which lets us quickly test five
   * properties of a character.  Each entry is a bitwise OR of these flags:
   *  VEC - valid escaped control char
   *        note. the solidus '/' may be escaped or not.
   *  IJC - invalid json char
   *  VHC - valid hex char
   *  NFP - needs further processing (from a string scanning perspective)
   *  NUC - needs utf8 checking when enabled (from a string scanning
   *        perspective)
   *
   * The element type is unsigned char because the entries are bit flags,
   * not characters. */
  #define VEC 0x01
  #define IJC 0x02
  #define VHC 0x04
  #define NFP 0x08
  #define NUC 0x10

  static const unsigned char charLookupTable[256] =
  {
      /* 0x00 - 0x1f: control characters */
  /*00*/ IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    ,
  /*08*/ IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    ,
  /*10*/ IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    ,
  /*18*/ IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    ,

      /* 0x22 is the double quote, 0x2f the solidus */
  /*20*/ 0      , 0      , NFP|VEC|IJC, 0  , 0      , 0      , 0      , 0      ,
  /*28*/ 0      , 0      , 0      , 0      , 0      , 0      , 0      , VEC    ,

      /* '0' - '9' */
  /*30*/ VHC    , VHC    , VHC    , VHC    , VHC    , VHC    , VHC    , VHC    ,
  /*38*/ VHC    , VHC    , 0      , 0      , 0      , 0      , 0      , 0      ,

      /* 'A' - 'F' */
  /*40*/ 0      , VHC    , VHC    , VHC    , VHC    , VHC    , VHC    , 0      ,
  /*48*/ 0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
  /*50*/ 0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,

      /* 0x5c is the backslash */
  /*58*/ 0      , 0      , 0      , 0      , NFP|VEC|IJC, 0  , 0      , 0      ,

      /* 'a' - 'f' are hex digits; b, f, n, r, t may follow a backslash */
  /*60*/ 0      , VHC    , VEC|VHC, VHC    , VHC    , VHC    , VEC|VHC, 0      ,
  /*68*/ 0      , 0      , 0      , 0      , 0      , 0      , VEC    , 0      ,
  /*70*/ 0      , 0      , VEC    , 0      , VEC    , 0      , 0      , 0      ,
  /*78*/ 0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,

      /* 0x80 - 0xff: bytes with the high bit set */
  /*80*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
  /*88*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
  /*90*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
  /*98*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
  /*a0*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
  /*a8*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
  /*b0*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
  /*b8*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
  /*c0*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
  /*c8*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
  /*d0*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
  /*d8*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
  /*e0*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
  /*e8*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
  /*f0*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
  /*f8*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC
  };
  158. /** process a variable length utf8 encoded codepoint.
  159. *
  160. * returns:
  161. * yajl_tok_string - if valid utf8 char was parsed and offset was
  162. * advanced
  163. * yajl_tok_eof - if end of input was hit before validation could
  164. * complete
  165. * yajl_tok_error - if invalid utf8 was encountered
  166. *
  167. * NOTE: on error the offset will point to the first char of the
  168. * invalid utf8 */
  169. #define UTF8_CHECK_EOF if (*offset >= jsonTextLen) { return yajl_tok_eof; }
  170. static yajl_tok
  171. yajl_lex_utf8_char(yajl_lexer lexer, const unsigned char * jsonText,
  172. size_t jsonTextLen, size_t * offset,
  173. unsigned char curChar)
  174. {
  175. if (curChar <= 0x7f) {
  176. /* single byte */
  177. return yajl_tok_string;
  178. } else if ((curChar >> 5) == 0x6) {
  179. /* two byte */
  180. UTF8_CHECK_EOF;
  181. curChar = readChar(lexer, jsonText, offset);
  182. if ((curChar >> 6) == 0x2) return yajl_tok_string;
  183. } else if ((curChar >> 4) == 0x0e) {
  184. /* three byte */
  185. UTF8_CHECK_EOF;
  186. curChar = readChar(lexer, jsonText, offset);
  187. if ((curChar >> 6) == 0x2) {
  188. UTF8_CHECK_EOF;
  189. curChar = readChar(lexer, jsonText, offset);
  190. if ((curChar >> 6) == 0x2) return yajl_tok_string;
  191. }
  192. } else if ((curChar >> 3) == 0x1e) {
  193. /* four byte */
  194. UTF8_CHECK_EOF;
  195. curChar = readChar(lexer, jsonText, offset);
  196. if ((curChar >> 6) == 0x2) {
  197. UTF8_CHECK_EOF;
  198. curChar = readChar(lexer, jsonText, offset);
  199. if ((curChar >> 6) == 0x2) {
  200. UTF8_CHECK_EOF;
  201. curChar = readChar(lexer, jsonText, offset);
  202. if ((curChar >> 6) == 0x2) return yajl_tok_string;
  203. }
  204. }
  205. }
  206. return yajl_tok_error;
  207. }
  /* lex a string.  input is the lexer, pointer to beginning of
   * json text, and start of string (offset).
   * a token is returned which has the following meanings:
   *   yajl_tok_string (or yajl_tok_string_with_escapes when a backslash
   *                   escape was seen): lex of string was successful.
   *                   offset has been advanced past the terminating '"'.
   *   yajl_tok_eof:   end of text was encountered before we could complete
   *                   the lex.
   *   yajl_tok_error: embedded in the string were unallowable chars.  for
   *                   invalid escape, hex and json chars the offending
   *                   char is unread so that offset points at it.
   */

  /* Bail out of yajl_lex_string with yajl_tok_eof once the caller's input
   * is exhausted.  Relies on the locals `offset`, `jsonTextLen` and `tok`
   * and on the label `finish_string_lex` of the enclosing function.
   * Wrapped in do { } while (0) so that "STR_CHECK_EOF;" is a single
   * statement and stays safe inside an unbraced if/else. */
  #define STR_CHECK_EOF                           \
      do {                                        \
          if (*offset >= jsonTextLen) {           \
              tok = yajl_tok_eof;                 \
              goto finish_string_lex;             \
          }                                       \
      } while (0)
  223. /** scan a string for interesting characters that might need further
  224. * review. return the number of chars that are uninteresting and can
  225. * be skipped.
  226. * (lth) hi world, any thoughts on how to make this routine faster? */
  227. static size_t
  228. yajl_string_scan(const unsigned char * buf, size_t len, int utf8check)
  229. {
  230. unsigned char mask = IJC|NFP|(utf8check ? NUC : 0);
  231. size_t skip = 0;
  232. while (skip < len && !(charLookupTable[*buf] & mask))
  233. {
  234. skip++;
  235. buf++;
  236. }
  237. return skip;
  238. }
  239. static yajl_tok
  240. yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
  241. size_t jsonTextLen, size_t * offset)
  242. {
  243. yajl_tok tok = yajl_tok_error;
  244. int hasEscapes = 0;
  245. for (;;) {
  246. unsigned char curChar;
  247. /* now jump into a faster scanning routine to skip as much
  248. * of the buffers as possible */
  249. {
  250. const unsigned char * p;
  251. size_t len;
  252. if ((lexer->bufInUse && yajl_buf_len(lexer->buf) &&
  253. lexer->bufOff < yajl_buf_len(lexer->buf)))
  254. {
  255. p = ((const unsigned char *) yajl_buf_data(lexer->buf) +
  256. (lexer->bufOff));
  257. len = yajl_buf_len(lexer->buf) - lexer->bufOff;
  258. lexer->bufOff += yajl_string_scan(p, len, lexer->validateUTF8);
  259. }
  260. else if (*offset < jsonTextLen)
  261. {
  262. p = jsonText + *offset;
  263. len = jsonTextLen - *offset;
  264. *offset += yajl_string_scan(p, len, lexer->validateUTF8);
  265. }
  266. }
  267. STR_CHECK_EOF;
  268. curChar = readChar(lexer, jsonText, offset);
  269. /* quote terminates */
  270. if (curChar == '"') {
  271. tok = yajl_tok_string;
  272. break;
  273. }
  274. /* backslash escapes a set of control chars, */
  275. else if (curChar == '\\') {
  276. hasEscapes = 1;
  277. STR_CHECK_EOF;
  278. /* special case \u */
  279. curChar = readChar(lexer, jsonText, offset);
  280. if (curChar == 'u') {
  281. unsigned int i = 0;
  282. for (i=0;i<4;i++) {
  283. STR_CHECK_EOF;
  284. curChar = readChar(lexer, jsonText, offset);
  285. if (!(charLookupTable[curChar] & VHC)) {
  286. /* back up to offending char */
  287. unreadChar(lexer, offset);
  288. lexer->error = yajl_lex_string_invalid_hex_char;
  289. goto finish_string_lex;
  290. }
  291. }
  292. } else if (!(charLookupTable[curChar] & VEC)) {
  293. /* back up to offending char */
  294. unreadChar(lexer, offset);
  295. lexer->error = yajl_lex_string_invalid_escaped_char;
  296. goto finish_string_lex;
  297. }
  298. }
  299. /* when not validating UTF8 it's a simple table lookup to determine
  300. * if the present character is invalid */
  301. else if(charLookupTable[curChar] & IJC) {
  302. /* back up to offending char */
  303. unreadChar(lexer, offset);
  304. lexer->error = yajl_lex_string_invalid_json_char;
  305. goto finish_string_lex;
  306. }
  307. /* when in validate UTF8 mode we need to do some extra work */
  308. else if (lexer->validateUTF8) {
  309. yajl_tok t = yajl_lex_utf8_char(lexer, jsonText, jsonTextLen,
  310. offset, curChar);
  311. if (t == yajl_tok_eof) {
  312. tok = yajl_tok_eof;
  313. goto finish_string_lex;
  314. } else if (t == yajl_tok_error) {
  315. lexer->error = yajl_lex_string_invalid_utf8;
  316. goto finish_string_lex;
  317. }
  318. }
  319. /* accept it, and move on */
  320. }
  321. finish_string_lex:
  322. /* tell our buddy, the parser, wether he needs to process this string
  323. * again */
  324. if (hasEscapes && tok == yajl_tok_string) {
  325. tok = yajl_tok_string_with_escapes;
  326. }
  327. return tok;
  328. }
  329. #define RETURN_IF_EOF if (*offset >= jsonTextLen) return yajl_tok_eof;
  330. static yajl_tok
  331. yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
  332. size_t jsonTextLen, size_t * offset)
  333. {
  334. /** XXX: numbers are the only entities in json that we must lex
  335. * _beyond_ in order to know that they are complete. There
  336. * is an ambiguous case for integers at EOF. */
  337. unsigned char c;
  338. yajl_tok tok = yajl_tok_integer;
  339. RETURN_IF_EOF;
  340. c = readChar(lexer, jsonText, offset);
  341. /* optional leading minus */
  342. char minus = 0;
  343. if (c == '-') {
  344. minus = 1;
  345. RETURN_IF_EOF;
  346. c = readChar(lexer, jsonText, offset);
  347. }
  348. /* a single zero, or a series of integers */
  349. if (c == '0') {
  350. RETURN_IF_EOF;
  351. c = readChar(lexer, jsonText, offset);
  352. } else if (c >= '1' && c <= '9') {
  353. do {
  354. RETURN_IF_EOF;
  355. c = readChar(lexer, jsonText, offset);
  356. } while (c >= '0' && c <= '9');
  357. } else if (c == 'i') {
  358. if (readChar(lexer, jsonText, offset) != 'n') {
  359. unreadChar(lexer, offset);
  360. lexer->error = yajl_lex_invalid_infinity;
  361. return yajl_tok_error;
  362. }
  363. if (readChar(lexer, jsonText, offset) != 'f') {
  364. unreadChar(lexer, offset);
  365. lexer->error = yajl_lex_invalid_infinity;
  366. return yajl_tok_error;
  367. }
  368. if (minus) {
  369. return yajl_tok_minus_inf;
  370. } else {
  371. return yajl_tok_inf;
  372. }
  373. } else {
  374. unreadChar(lexer, offset);
  375. lexer->error = yajl_lex_missing_integer_after_minus;
  376. return yajl_tok_error;
  377. }
  378. /* optional fraction (indicates this is floating point) */
  379. if (c == '.') {
  380. int numRd = 0;
  381. RETURN_IF_EOF;
  382. c = readChar(lexer, jsonText, offset);
  383. while (c >= '0' && c <= '9') {
  384. numRd++;
  385. RETURN_IF_EOF;
  386. c = readChar(lexer, jsonText, offset);
  387. }
  388. if (!numRd) {
  389. unreadChar(lexer, offset);
  390. lexer->error = yajl_lex_missing_integer_after_decimal;
  391. return yajl_tok_error;
  392. }
  393. tok = yajl_tok_double;
  394. }
  395. /* optional exponent (indicates this is floating point) */
  396. if (c == 'e' || c == 'E') {
  397. RETURN_IF_EOF;
  398. c = readChar(lexer, jsonText, offset);
  399. /* optional sign */
  400. if (c == '+' || c == '-') {
  401. RETURN_IF_EOF;
  402. c = readChar(lexer, jsonText, offset);
  403. }
  404. if (c >= '0' && c <= '9') {
  405. do {
  406. RETURN_IF_EOF;
  407. c = readChar(lexer, jsonText, offset);
  408. } while (c >= '0' && c <= '9');
  409. } else {
  410. unreadChar(lexer, offset);
  411. lexer->error = yajl_lex_missing_integer_after_exponent;
  412. return yajl_tok_error;
  413. }
  414. tok = yajl_tok_double;
  415. }
  416. /* we always go "one too far" */
  417. unreadChar(lexer, offset);
  418. return tok;
  419. }
  420. static yajl_tok
  421. yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText,
  422. size_t jsonTextLen, size_t * offset)
  423. {
  424. unsigned char c;
  425. yajl_tok tok = yajl_tok_comment;
  426. RETURN_IF_EOF;
  427. c = readChar(lexer, jsonText, offset);
  428. /* either slash or star expected */
  429. if (c == '/') {
  430. /* now we throw away until end of line */
  431. do {
  432. RETURN_IF_EOF;
  433. c = readChar(lexer, jsonText, offset);
  434. } while (c != '\n');
  435. } else if (c == '*') {
  436. /* now we throw away until end of comment */
  437. for (;;) {
  438. RETURN_IF_EOF;
  439. c = readChar(lexer, jsonText, offset);
  440. if (c == '*') {
  441. RETURN_IF_EOF;
  442. c = readChar(lexer, jsonText, offset);
  443. if (c == '/') {
  444. break;
  445. } else {
  446. unreadChar(lexer, offset);
  447. }
  448. }
  449. }
  450. } else {
  451. lexer->error = yajl_lex_invalid_char;
  452. tok = yajl_tok_error;
  453. }
  454. return tok;
  455. }
  456. #define MATCH(want_value, target_token) \
  457. const char * want = want_value; \
  458. do { \
  459. if (*offset >= jsonTextLen) { \
  460. tok = yajl_tok_eof; \
  461. goto lexed; \
  462. } \
  463. c = readChar(lexer, jsonText, offset); \
  464. if (c != *want) { \
  465. unreadChar(lexer, offset); \
  466. lexer->error = yajl_lex_invalid_string; \
  467. tok = yajl_tok_error; \
  468. goto lexed; \
  469. } \
  470. } while (*(++want)); \
  471. tok = target_token; \
  472. goto lexed;
  473. yajl_tok
  474. yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
  475. size_t jsonTextLen, size_t * offset,
  476. const unsigned char ** outBuf, size_t * outLen)
  477. {
  478. yajl_tok tok = yajl_tok_error;
  479. unsigned char c;
  480. size_t startOffset = *offset;
  481. *outBuf = NULL;
  482. *outLen = 0;
  483. for (;;) {
  484. assert(*offset <= jsonTextLen);
  485. if (*offset >= jsonTextLen) {
  486. tok = yajl_tok_eof;
  487. goto lexed;
  488. }
  489. c = readChar(lexer, jsonText, offset);
  490. switch (c) {
  491. case '{':
  492. tok = yajl_tok_left_bracket;
  493. goto lexed;
  494. case '}':
  495. tok = yajl_tok_right_bracket;
  496. goto lexed;
  497. case '[':
  498. tok = yajl_tok_left_brace;
  499. goto lexed;
  500. case ']':
  501. tok = yajl_tok_right_brace;
  502. goto lexed;
  503. case ',':
  504. tok = yajl_tok_comma;
  505. goto lexed;
  506. case ':':
  507. tok = yajl_tok_colon;
  508. goto lexed;
  509. case '\t': case '\n': case '\v': case '\f': case '\r': case ' ':
  510. startOffset++;
  511. break;
  512. case 't': {
  513. MATCH("rue", yajl_tok_bool);
  514. }
  515. case 'f': {
  516. MATCH("alse", yajl_tok_bool);
  517. }
  518. case 'n': {
  519. MATCH("ull", yajl_tok_null);
  520. }
  521. case '"': {
  522. tok = yajl_lex_string(lexer, (const unsigned char *) jsonText,
  523. jsonTextLen, offset);
  524. goto lexed;
  525. }
  526. case '-':
  527. case 'i':
  528. case '0': case '1': case '2': case '3': case '4':
  529. case '5': case '6': case '7': case '8': case '9': {
  530. /* integer parsing wants to start from the beginning */
  531. unreadChar(lexer, offset);
  532. tok = yajl_lex_number(lexer, (const unsigned char *) jsonText,
  533. jsonTextLen, offset);
  534. goto lexed;
  535. }
  536. case '/':
  537. /* hey, look, a probable comment! If comments are disabled
  538. * it's an error. */
  539. if (!lexer->allowComments) {
  540. unreadChar(lexer, offset);
  541. lexer->error = yajl_lex_unallowed_comment;
  542. tok = yajl_tok_error;
  543. goto lexed;
  544. }
  545. /* if comments are enabled, then we should try to lex
  546. * the thing. possible outcomes are
  547. * - successful lex (tok_comment, which means continue),
  548. * - malformed comment opening (slash not followed by
  549. * '*' or '/') (tok_error)
  550. * - eof hit. (tok_eof) */
  551. tok = yajl_lex_comment(lexer, (const unsigned char *) jsonText,
  552. jsonTextLen, offset);
  553. if (tok == yajl_tok_comment) {
  554. /* "error" is silly, but that's the initial
  555. * state of tok. guilty until proven innocent. */
  556. tok = yajl_tok_error;
  557. yajl_buf_clear(lexer->buf);
  558. lexer->bufInUse = 0;
  559. startOffset = *offset;
  560. break;
  561. }
  562. /* hit error or eof, bail */
  563. goto lexed;
  564. default:
  565. lexer->error = yajl_lex_invalid_char;
  566. tok = yajl_tok_error;
  567. goto lexed;
  568. }
  569. }
  570. lexed:
  571. /* need to append to buffer if the buffer is in use or
  572. * if it's an EOF token */
  573. if (tok == yajl_tok_eof || lexer->bufInUse) {
  574. if (!lexer->bufInUse) yajl_buf_clear(lexer->buf);
  575. lexer->bufInUse = 1;
  576. yajl_buf_append(lexer->buf, jsonText + startOffset, *offset - startOffset);
  577. lexer->bufOff = 0;
  578. if (tok != yajl_tok_eof) {
  579. *outBuf = yajl_buf_data(lexer->buf);
  580. *outLen = yajl_buf_len(lexer->buf);
  581. lexer->bufInUse = 0;
  582. }
  583. } else if (tok != yajl_tok_error) {
  584. *outBuf = jsonText + startOffset;
  585. *outLen = *offset - startOffset;
  586. }
  587. /* special case for strings. skip the quotes. */
  588. if (tok == yajl_tok_string || tok == yajl_tok_string_with_escapes)
  589. {
  590. assert(*outLen >= 2);
  591. (*outBuf)++;
  592. *outLen -= 2;
  593. }
  594. #ifdef YAJL_LEXER_DEBUG
  595. if (tok == yajl_tok_error) {
  596. printf("lexical error: %s\n",
  597. yajl_lex_error_to_string(yajl_lex_get_error(lexer)));
  598. } else if (tok == yajl_tok_eof) {
  599. printf("EOF hit\n");
  600. } else {
  601. printf("lexed %s: '", tokToStr(tok));
  602. fwrite(*outBuf, 1, *outLen, stdout);
  603. printf("'\n");
  604. }
  605. #endif
  606. return tok;
  607. }
  608. const char *
  609. yajl_lex_error_to_string(yajl_lex_error error)
  610. {
  611. switch (error) {
  612. case yajl_lex_e_ok:
  613. return "ok, no error";
  614. case yajl_lex_string_invalid_utf8:
  615. return "invalid bytes in UTF8 string.";
  616. case yajl_lex_string_invalid_escaped_char:
  617. return "inside a string, '\\' occurs before a character "
  618. "which it may not.";
  619. case yajl_lex_string_invalid_json_char:
  620. return "invalid character inside string.";
  621. case yajl_lex_string_invalid_hex_char:
  622. return "invalid (non-hex) character occurs after '\\u' inside "
  623. "string.";
  624. case yajl_lex_invalid_char:
  625. return "invalid char in json text.";
  626. case yajl_lex_invalid_string:
  627. return "invalid string in json text.";
  628. case yajl_lex_missing_integer_after_exponent:
  629. return "malformed number, a digit is required after the exponent.";
  630. case yajl_lex_missing_integer_after_decimal:
  631. return "malformed number, a digit is required after the "
  632. "decimal point.";
  633. case yajl_lex_missing_integer_after_minus:
  634. return "malformed number, a digit is required after the "
  635. "minus sign.";
  636. case yajl_lex_invalid_infinity:
  637. return "malformed number, a token inf required for number starting "
  638. "from 'i'";
  639. case yajl_lex_unallowed_comment:
  640. return "probable comment found in input text, comments are "
  641. "not enabled.";
  642. }
  643. return "unknown error code";
  644. }
  645. /** allows access to more specific information about the lexical
  646. * error when yajl_lex_lex returns yajl_tok_error. */
  647. yajl_lex_error
  648. yajl_lex_get_error(yajl_lexer lexer)
  649. {
  650. if (lexer == NULL) return (yajl_lex_error) -1;
  651. return lexer->error;
  652. }
  653. size_t yajl_lex_current_line(yajl_lexer lexer)
  654. {
  655. return lexer->lineOff;
  656. }
  657. size_t yajl_lex_current_char(yajl_lexer lexer)
  658. {
  659. return lexer->charOff;
  660. }
  661. yajl_tok yajl_lex_peek(yajl_lexer lexer, const unsigned char * jsonText,
  662. size_t jsonTextLen, size_t offset)
  663. {
  664. const unsigned char * outBuf;
  665. size_t outLen;
  666. size_t bufLen = yajl_buf_len(lexer->buf);
  667. size_t bufOff = lexer->bufOff;
  668. unsigned int bufInUse = lexer->bufInUse;
  669. yajl_tok tok;
  670. tok = yajl_lex_lex(lexer, jsonText, jsonTextLen, &offset,
  671. &outBuf, &outLen);
  672. lexer->bufOff = bufOff;
  673. lexer->bufInUse = bufInUse;
  674. yajl_buf_truncate(lexer->buf, bufLen);
  675. return tok;
  676. }
  677. size_t yajl_lex_buf_capacity(yajl_lexer lexer)
  678. {
  679. return yajl_buf_capacity(lexer->buf);
  680. }