log2journal-json.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #include "log2journal.h"
  3. #define JSON_ERROR_LINE_MAX 1024
  4. #define JSON_KEY_MAX 1024
  5. #define JSON_DEPTH_MAX 100
  6. struct log_json_state {
  7. LOG_JOB *jb;
  8. const char *line;
  9. uint32_t pos;
  10. uint32_t depth;
  11. char *stack[JSON_DEPTH_MAX];
  12. char key[JSON_KEY_MAX];
  13. char msg[JSON_ERROR_LINE_MAX];
  14. };
  15. static inline bool json_parse_object(LOG_JSON_STATE *js);
  16. static inline bool json_parse_array(LOG_JSON_STATE *js);
  17. #define json_current_pos(js) &(js)->line[(js)->pos]
  18. #define json_consume_char(js) ++(js)->pos
  19. static inline void json_process_key_value(LOG_JSON_STATE *js, const char *value, size_t len) {
  20. log_job_send_extracted_key_value(js->jb, js->key, value, len);
  21. }
  22. static inline void json_skip_spaces(LOG_JSON_STATE *js) {
  23. const char *s = json_current_pos(js);
  24. const char *start = s;
  25. while(isspace(*s)) s++;
  26. js->pos += s - start;
  27. }
  28. static inline bool json_expect_char_after_white_space(LOG_JSON_STATE *js, const char *expected) {
  29. json_skip_spaces(js);
  30. const char *s = json_current_pos(js);
  31. for(const char *e = expected; *e ;e++) {
  32. if (*s == *e)
  33. return true;
  34. }
  35. snprintf(js->msg, sizeof(js->msg),
  36. "JSON PARSER: character '%c' is not one of the expected characters (%s), at pos %zu",
  37. *s ? *s : '?', expected, js->pos);
  38. return false;
  39. }
  40. static inline bool json_parse_null(LOG_JSON_STATE *js) {
  41. const char *s = json_current_pos(js);
  42. if (strncmp(s, "null", 4) == 0) {
  43. json_process_key_value(js, "null", 4);
  44. js->pos += 4;
  45. return true;
  46. }
  47. else {
  48. snprintf(js->msg, sizeof(js->msg),
  49. "JSON PARSER: expected 'null', found '%.4s' at position %zu", s, js->pos);
  50. return false;
  51. }
  52. }
  53. static inline bool json_parse_true(LOG_JSON_STATE *js) {
  54. const char *s = json_current_pos(js);
  55. if (strncmp(s, "true", 4) == 0) {
  56. json_process_key_value(js, "true", 4);
  57. js->pos += 4;
  58. return true;
  59. }
  60. else {
  61. snprintf(js->msg, sizeof(js->msg),
  62. "JSON PARSER: expected 'true', found '%.4s' at position %zu", s, js->pos);
  63. return false;
  64. }
  65. }
  66. static inline bool json_parse_false(LOG_JSON_STATE *js) {
  67. const char *s = json_current_pos(js);
  68. if (strncmp(s, "false", 5) == 0) {
  69. json_process_key_value(js, "false", 5);
  70. js->pos += 5;
  71. return true;
  72. }
  73. else {
  74. snprintf(js->msg, sizeof(js->msg),
  75. "JSON PARSER: expected 'false', found '%.4s' at position %zu", s, js->pos);
  76. return false;
  77. }
  78. }
  79. static inline bool json_parse_number(LOG_JSON_STATE *js) {
  80. static __thread char value[8192];
  81. value[0] = '\0';
  82. char *d = value;
  83. const char *s = json_current_pos(js);
  84. size_t remaining = sizeof(value) - 1; // Reserve space for null terminator
  85. // Optional minus sign
  86. if (*s == '-') {
  87. *d++ = *s++;
  88. remaining--;
  89. }
  90. // Digits before decimal point
  91. while (*s >= '0' && *s <= '9') {
  92. if (remaining < 2) {
  93. snprintf(js->msg, sizeof(js->msg), "JSON PARSER: truncated number value at pos %zu", js->pos);
  94. return false;
  95. }
  96. *d++ = *s++;
  97. remaining--;
  98. }
  99. // Decimal point and fractional part
  100. if (*s == '.') {
  101. *d++ = *s++;
  102. remaining--;
  103. while (*s >= '0' && *s <= '9') {
  104. if (remaining < 2) {
  105. snprintf(js->msg, sizeof(js->msg), "JSON PARSER: truncated fractional part at pos %zu", js->pos);
  106. return false;
  107. }
  108. *d++ = *s++;
  109. remaining--;
  110. }
  111. }
  112. // Exponent part
  113. if (*s == 'e' || *s == 'E') {
  114. *d++ = *s++;
  115. remaining--;
  116. // Optional sign in exponent
  117. if (*s == '+' || *s == '-') {
  118. *d++ = *s++;
  119. remaining--;
  120. }
  121. while (*s >= '0' && *s <= '9') {
  122. if (remaining < 2) {
  123. snprintf(js->msg, sizeof(js->msg), "JSON PARSER: truncated exponent at pos %zu", js->pos);
  124. return false;
  125. }
  126. *d++ = *s++;
  127. remaining--;
  128. }
  129. }
  130. *d = '\0';
  131. js->pos += d - value;
  132. if (d > value) {
  133. json_process_key_value(js, value, d - value);
  134. return true;
  135. } else {
  136. snprintf(js->msg, sizeof(js->msg), "JSON PARSER: invalid number format at pos %zu", js->pos);
  137. return false;
  138. }
  139. }
  140. static inline bool encode_utf8(unsigned codepoint, char **d, size_t *remaining) {
  141. if (codepoint <= 0x7F) {
  142. // 1-byte sequence
  143. if (*remaining < 2) return false; // +1 for the null
  144. *(*d)++ = (char)codepoint;
  145. (*remaining)--;
  146. }
  147. else if (codepoint <= 0x7FF) {
  148. // 2-byte sequence
  149. if (*remaining < 3) return false; // +1 for the null
  150. *(*d)++ = (char)(0xC0 | ((codepoint >> 6) & 0x1F));
  151. *(*d)++ = (char)(0x80 | (codepoint & 0x3F));
  152. (*remaining) -= 2;
  153. }
  154. else if (codepoint <= 0xFFFF) {
  155. // 3-byte sequence
  156. if (*remaining < 4) return false; // +1 for the null
  157. *(*d)++ = (char)(0xE0 | ((codepoint >> 12) & 0x0F));
  158. *(*d)++ = (char)(0x80 | ((codepoint >> 6) & 0x3F));
  159. *(*d)++ = (char)(0x80 | (codepoint & 0x3F));
  160. (*remaining) -= 3;
  161. }
  162. else if (codepoint <= 0x10FFFF) {
  163. // 4-byte sequence
  164. if (*remaining < 5) return false; // +1 for the null
  165. *(*d)++ = (char)(0xF0 | ((codepoint >> 18) & 0x07));
  166. *(*d)++ = (char)(0x80 | ((codepoint >> 12) & 0x3F));
  167. *(*d)++ = (char)(0x80 | ((codepoint >> 6) & 0x3F));
  168. *(*d)++ = (char)(0x80 | (codepoint & 0x3F));
  169. (*remaining) -= 4;
  170. }
  171. else
  172. // Invalid code point
  173. return false;
  174. return true;
  175. }
  176. size_t parse_surrogate(const char *s, char *d, size_t *remaining) {
  177. if (s[0] != '\\' || (s[1] != 'u' && s[1] != 'U')) {
  178. return 0; // Not a valid Unicode escape sequence
  179. }
  180. char hex[9] = {0}; // Buffer for the hexadecimal value
  181. unsigned codepoint;
  182. if (s[1] == 'u') {
  183. // Handle \uXXXX
  184. if (!isxdigit(s[2]) || !isxdigit(s[3]) || !isxdigit(s[4]) || !isxdigit(s[5])) {
  185. return 0; // Not a valid \uXXXX sequence
  186. }
  187. hex[0] = s[2];
  188. hex[1] = s[3];
  189. hex[2] = s[4];
  190. hex[3] = s[5];
  191. codepoint = (unsigned)strtoul(hex, NULL, 16);
  192. if (codepoint >= 0xD800 && codepoint <= 0xDBFF) {
  193. // Possible start of surrogate pair
  194. if (s[6] == '\\' && s[7] == 'u' && isxdigit(s[8]) && isxdigit(s[9]) &&
  195. isxdigit(s[10]) && isxdigit(s[11])) {
  196. // Valid low surrogate
  197. unsigned low_surrogate = strtoul(&s[8], NULL, 16);
  198. if (low_surrogate < 0xDC00 || low_surrogate > 0xDFFF) {
  199. return 0; // Invalid low surrogate
  200. }
  201. codepoint = 0x10000 + ((codepoint - 0xD800) << 10) + (low_surrogate - 0xDC00);
  202. return encode_utf8(codepoint, &d, remaining) ? 12 : 0; // \uXXXX\uXXXX
  203. }
  204. }
  205. // Single \uXXXX
  206. return encode_utf8(codepoint, &d, remaining) ? 6 : 0;
  207. }
  208. else {
  209. // Handle \UXXXXXXXX
  210. for (int i = 2; i < 10; i++) {
  211. if (!isxdigit(s[i])) {
  212. return 0; // Not a valid \UXXXXXXXX sequence
  213. }
  214. hex[i - 2] = s[i];
  215. }
  216. codepoint = (unsigned)strtoul(hex, NULL, 16);
  217. return encode_utf8(codepoint, &d, remaining) ? 10 : 0; // \UXXXXXXXX
  218. }
  219. }
  220. static inline void copy_newline(LOG_JSON_STATE *js __maybe_unused, char **d, size_t *remaining) {
  221. if(*remaining > 3) {
  222. *(*d)++ = '\\';
  223. *(*d)++ = 'n';
  224. (*remaining) -= 2;
  225. }
  226. }
  227. static inline void copy_tab(LOG_JSON_STATE *js __maybe_unused, char **d, size_t *remaining) {
  228. if(*remaining > 3) {
  229. *(*d)++ = '\\';
  230. *(*d)++ = 't';
  231. (*remaining) -= 2;
  232. }
  233. }
  234. static inline bool json_parse_string(LOG_JSON_STATE *js) {
  235. static __thread char value[JOURNAL_MAX_VALUE_LEN];
  236. if(!json_expect_char_after_white_space(js, "\""))
  237. return false;
  238. json_consume_char(js);
  239. value[0] = '\0';
  240. char *d = value;
  241. const char *s = json_current_pos(js);
  242. size_t remaining = sizeof(value);
  243. while (*s && *s != '"') {
  244. char c;
  245. if (*s == '\\') {
  246. s++;
  247. switch (*s) {
  248. case 'n':
  249. copy_newline(js, &d, &remaining);
  250. s++;
  251. continue;
  252. case 't':
  253. copy_tab(js, &d, &remaining);
  254. s++;
  255. continue;
  256. case 'f':
  257. case 'b':
  258. case 'r':
  259. c = ' ';
  260. s++;
  261. break;
  262. case 'u': {
  263. size_t old_remaining = remaining;
  264. size_t consumed = parse_surrogate(s - 1, d, &remaining);
  265. if (consumed > 0) {
  266. s += consumed - 1; // -1 because we already incremented s after '\\'
  267. d += old_remaining - remaining;
  268. continue;
  269. }
  270. else {
  271. *d++ = '\\';
  272. remaining--;
  273. c = *s++;
  274. }
  275. }
  276. break;
  277. default:
  278. c = *s++;
  279. break;
  280. }
  281. }
  282. else
  283. c = *s++;
  284. if(remaining < 2) {
  285. snprintf(js->msg, sizeof(js->msg),
  286. "JSON PARSER: truncated string value at pos %zu", js->pos);
  287. return false;
  288. }
  289. else {
  290. *d++ = c;
  291. remaining--;
  292. }
  293. }
  294. *d = '\0';
  295. js->pos += s - json_current_pos(js);
  296. if(!json_expect_char_after_white_space(js, "\""))
  297. return false;
  298. json_consume_char(js);
  299. if(d > value)
  300. json_process_key_value(js, value, d - value);
  301. return true;
  302. }
  303. static inline bool json_parse_key_and_push(LOG_JSON_STATE *js) {
  304. if (!json_expect_char_after_white_space(js, "\""))
  305. return false;
  306. if(js->depth >= JSON_DEPTH_MAX - 1) {
  307. snprintf(js->msg, sizeof(js->msg),
  308. "JSON PARSER: object too deep, at pos %zu", js->pos);
  309. return false;
  310. }
  311. json_consume_char(js);
  312. char *d = js->stack[js->depth];
  313. if(js->depth)
  314. *d++ = '_';
  315. size_t remaining = sizeof(js->key) - (d - js->key);
  316. const char *s = json_current_pos(js);
  317. char last_c = '\0';
  318. while(*s && *s != '\"') {
  319. char c;
  320. if (*s == '\\') {
  321. s++;
  322. c = (char)((*s == 'u') ? '_' : journal_key_characters_map[(unsigned char)*s]);
  323. s += (*s == 'u') ? 5 : 1;
  324. }
  325. else
  326. c = journal_key_characters_map[(unsigned char)*s++];
  327. if(c == '_' && last_c == '_')
  328. continue;
  329. else {
  330. if(remaining < 2) {
  331. snprintf(js->msg, sizeof(js->msg),
  332. "JSON PARSER: key buffer full - keys are too long, at pos %zu", js->pos);
  333. return false;
  334. }
  335. *d++ = c;
  336. remaining--;
  337. }
  338. last_c = c;
  339. }
  340. *d = '\0';
  341. js->pos += s - json_current_pos(js);
  342. if (!json_expect_char_after_white_space(js, "\""))
  343. return false;
  344. json_consume_char(js);
  345. js->stack[++js->depth] = d;
  346. return true;
  347. }
  348. static inline bool json_key_pop(LOG_JSON_STATE *js) {
  349. if(js->depth <= 0) {
  350. snprintf(js->msg, sizeof(js->msg),
  351. "JSON PARSER: cannot pop a key at depth %zu, at pos %zu", js->depth, js->pos);
  352. return false;
  353. }
  354. char *k = js->stack[js->depth--];
  355. *k = '\0';
  356. return true;
  357. }
  358. static inline bool json_parse_value(LOG_JSON_STATE *js) {
  359. if(!json_expect_char_after_white_space(js, "-.0123456789tfn\"{["))
  360. return false;
  361. const char *s = json_current_pos(js);
  362. switch(*s) {
  363. case '-':
  364. case '0':
  365. case '1':
  366. case '2':
  367. case '3':
  368. case '4':
  369. case '5':
  370. case '6':
  371. case '7':
  372. case '8':
  373. case '9':
  374. return json_parse_number(js);
  375. case 't':
  376. return json_parse_true(js);
  377. case 'f':
  378. return json_parse_false(js);
  379. case 'n':
  380. return json_parse_null(js);
  381. case '"':
  382. return json_parse_string(js);
  383. case '{':
  384. return json_parse_object(js);
  385. case '[':
  386. return json_parse_array(js);
  387. }
  388. snprintf(js->msg, sizeof(js->msg),
  389. "JSON PARSER: unexpected character at pos %zu", js->pos);
  390. return false;
  391. }
  392. static inline bool json_key_index_and_push(LOG_JSON_STATE *js, size_t index) {
  393. char *d = js->stack[js->depth];
  394. if(js->depth > 0) {
  395. *d++ = '_';
  396. }
  397. // Convert index to string manually
  398. char temp[32];
  399. char *t = temp + sizeof(temp) - 1; // Start at the end of the buffer
  400. *t = '\0';
  401. do {
  402. *--t = (char)((index % 10) + '0');
  403. index /= 10;
  404. } while (index > 0);
  405. size_t remaining = sizeof(js->key) - (d - js->key);
  406. // Append the index to the key
  407. while (*t) {
  408. if(remaining < 2) {
  409. snprintf(js->msg, sizeof(js->msg),
  410. "JSON PARSER: key buffer full - keys are too long, at pos %zu", js->pos);
  411. return false;
  412. }
  413. *d++ = *t++;
  414. remaining--;
  415. }
  416. *d = '\0'; // Null-terminate the key
  417. js->stack[++js->depth] = d;
  418. return true;
  419. }
  420. static inline bool json_parse_array(LOG_JSON_STATE *js) {
  421. if(!json_expect_char_after_white_space(js, "["))
  422. return false;
  423. json_consume_char(js);
  424. size_t index = 0;
  425. do {
  426. if(!json_key_index_and_push(js, index))
  427. return false;
  428. if(!json_parse_value(js))
  429. return false;
  430. json_key_pop(js);
  431. if(!json_expect_char_after_white_space(js, ",]"))
  432. return false;
  433. const char *s = json_current_pos(js);
  434. json_consume_char(js);
  435. if(*s == ',') {
  436. index++;
  437. continue;
  438. }
  439. else // }
  440. break;
  441. } while(true);
  442. return true;
  443. }
  444. static inline bool json_parse_object(LOG_JSON_STATE *js) {
  445. if(!json_expect_char_after_white_space(js, "{"))
  446. return false;
  447. json_consume_char(js);
  448. do {
  449. if (!json_expect_char_after_white_space(js, "\""))
  450. return false;
  451. if(!json_parse_key_and_push(js))
  452. return false;
  453. if(!json_expect_char_after_white_space(js, ":"))
  454. return false;
  455. json_consume_char(js);
  456. if(!json_parse_value(js))
  457. return false;
  458. json_key_pop(js);
  459. if(!json_expect_char_after_white_space(js, ",}"))
  460. return false;
  461. const char *s = json_current_pos(js);
  462. json_consume_char(js);
  463. if(*s == ',')
  464. continue;
  465. else // }
  466. break;
  467. } while(true);
  468. return true;
  469. }
  470. LOG_JSON_STATE *json_parser_create(LOG_JOB *jb) {
  471. LOG_JSON_STATE *js = mallocz(sizeof(LOG_JSON_STATE));
  472. memset(js, 0, sizeof(LOG_JSON_STATE));
  473. js->jb = jb;
  474. if(jb->prefix)
  475. copy_to_buffer(js->key, sizeof(js->key), js->jb->prefix, strlen(js->jb->prefix));
  476. js->stack[0] = &js->key[strlen(js->key)];
  477. return js;
  478. }
  479. void json_parser_destroy(LOG_JSON_STATE *js) {
  480. if(js)
  481. freez(js);
  482. }
  483. const char *json_parser_error(LOG_JSON_STATE *js) {
  484. return js->msg;
  485. }
  486. bool json_parse_document(LOG_JSON_STATE *js, const char *txt) {
  487. js->line = txt;
  488. js->pos = 0;
  489. js->msg[0] = '\0';
  490. js->stack[0][0] = '\0';
  491. js->depth = 0;
  492. if(!json_parse_object(js))
  493. return false;
  494. json_skip_spaces(js);
  495. const char *s = json_current_pos(js);
  496. if(*s) {
  497. snprintf(js->msg, sizeof(js->msg),
  498. "JSON PARSER: excess characters found after document is finished, at pos %zu", js->pos);
  499. return false;
  500. }
  501. return true;
  502. }
  503. void json_test(void) {
  504. LOG_JOB jb = { .prefix = "NIGNX_" };
  505. LOG_JSON_STATE *json = json_parser_create(&jb);
  506. json_parse_document(json, "{\"value\":\"\\u\\u039A\\u03B1\\u03BB\\u03B7\\u03BC\\u03AD\\u03C1\\u03B1\"}");
  507. json_parser_destroy(json);
  508. }