/* * sfparse * * Copyright (c) 2023 sfparse contributors * Copyright (c) 2019 nghttp3 contributors * Copyright (c) 2015 nghttp2 contributors * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include "sfparse.h" #include #include #include #ifdef __AVX2__ # include #endif /* __AVX2__ */ #define SFPARSE_STATE_DICT 0x08u #define SFPARSE_STATE_LIST 0x10u #define SFPARSE_STATE_ITEM 0x18u #define SFPARSE_STATE_INNER_LIST 0x04u #define SFPARSE_STATE_BEFORE 0x00u #define SFPARSE_STATE_BEFORE_PARAMS 0x01u #define SFPARSE_STATE_PARAMS 0x02u #define SFPARSE_STATE_AFTER 0x03u #define SFPARSE_STATE_OP_MASK 0x03u #define SFPARSE_SET_STATE_AFTER(NAME) \ (SFPARSE_STATE_##NAME | SFPARSE_STATE_AFTER) #define SFPARSE_SET_STATE_BEFORE_PARAMS(NAME) \ (SFPARSE_STATE_##NAME | SFPARSE_STATE_BEFORE_PARAMS) #define SFPARSE_SET_STATE_INNER_LIST_BEFORE(NAME) \ (SFPARSE_STATE_##NAME | SFPARSE_STATE_INNER_LIST | SFPARSE_STATE_BEFORE) #define SFPARSE_STATE_DICT_AFTER SFPARSE_SET_STATE_AFTER(DICT) #define SFPARSE_STATE_DICT_BEFORE_PARAMS SFPARSE_SET_STATE_BEFORE_PARAMS(DICT) #define SFPARSE_STATE_DICT_INNER_LIST_BEFORE \ SFPARSE_SET_STATE_INNER_LIST_BEFORE(DICT) #define SFPARSE_STATE_LIST_AFTER SFPARSE_SET_STATE_AFTER(LIST) #define SFPARSE_STATE_LIST_BEFORE_PARAMS SFPARSE_SET_STATE_BEFORE_PARAMS(LIST) #define SFPARSE_STATE_LIST_INNER_LIST_BEFORE \ SFPARSE_SET_STATE_INNER_LIST_BEFORE(LIST) #define SFPARSE_STATE_ITEM_AFTER SFPARSE_SET_STATE_AFTER(ITEM) #define SFPARSE_STATE_ITEM_BEFORE_PARAMS SFPARSE_SET_STATE_BEFORE_PARAMS(ITEM) #define SFPARSE_STATE_ITEM_INNER_LIST_BEFORE \ SFPARSE_SET_STATE_INNER_LIST_BEFORE(ITEM) #define SFPARSE_STATE_INITIAL 0x00u #define DIGIT_CASES \ case '0': \ case '1': \ case '2': \ case '3': \ case '4': \ case '5': \ case '6': \ case '7': \ case '8': \ case '9' #define LCALPHA_CASES \ case 'a': \ case 'b': \ case 'c': \ case 'd': \ case 'e': \ case 'f': \ case 'g': \ case 'h': \ case 'i': \ case 'j': \ case 'k': \ case 'l': \ case 'm': \ case 'n': \ case 'o': \ case 'p': \ case 'q': \ case 'r': \ case 's': \ case 't': \ case 'u': \ case 'v': \ case 'w': \ case 'x': \ case 'y': \ case 'z' #define UCALPHA_CASES \ case 'A': \ case 'B': \ case 'C': \ case 'D': \ case 'E': \ case 'F': \ case 'G': \ case 'H': \ case 'I': \ case 'J': \ case 'K': \ case 'L': \ case 'M': \ case 'N': \ case 'O': \ case 'P': \ case 'Q': \ case 'R': \ case 'S': \ case 'T': \ case 'U': \ case 'V': \ case 'W': \ case 'X': \ case 'Y': \ case 'Z' #define ALPHA_CASES \ UCALPHA_CASES: \ LCALPHA_CASES #define TOKEN_CASES \ case '!': \ case '#': \ case '$': \ case '%': \ case '&': \ case '\'': \ case '*': \ case '+': \ case '-': \ case '.': \ case '/': \ DIGIT_CASES: \ case ':': \ UCALPHA_CASES: \ case '^': \ case '_': \ case '`': \ LCALPHA_CASES: \ case '|': \ case '~' #define LCHEXALPHA_CASES \ case 'a': \ case 'b': \ case 'c': \ case 'd': \ case 'e': \ case 'f' #define X00_1F_CASES \ case 0x00: \ case 0x01: \ case 0x02: \ case 0x03: \ case 0x04: \ case 0x05: \ case 0x06: \ case 0x07: \ case 0x08: \ case 0x09: \ case 0x0a: \ case 0x0b: \ case 0x0c: \ case 0x0d: \ case 0x0e: \ case 0x0f: \ case 0x10: \ case 0x11: \ case 0x12: \ case 0x13: \ case 0x14: \ case 0x15: \ case 0x16: \ case 0x17: \ case 0x18: \ case 0x19: \ case 0x1a: \ case 0x1b: \ case 0x1c: \ case 0x1d: \ case 0x1e: \ case 0x1f #define X20_21_CASES \ case ' ': \ case '!' #define X23_5B_CASES \ case '#': \ case '$': \ case '%': \ case '&': \ case '\'': \ case '(': \ case ')': \ case '*': \ case '+': \ case ',': \ case '-': \ case '.': \ case '/': \ DIGIT_CASES: \ case ':': \ case ';': \ case '<': \ case '=': \ case '>': \ case '?': \ case '@': \ UCALPHA_CASES: \ case '[' #define X5D_7E_CASES \ case ']': \ case '^': \ case '_': \ case '`': \ LCALPHA_CASES: \ case '{': \ case '|': \ case '}': \ case '~' #define X7F_FF_CASES \ case 0x7f: \ case 0x80: \ case 0x81: \ case 0x82: \ case 0x83: \ case 0x84: \ case 0x85: \ case 0x86: \ case 0x87: \ case 0x88: \ case 0x89: \ case 0x8a: \ case 0x8b: \ case 0x8c: \ case 0x8d: \ case 0x8e: \ case 0x8f: \ case 0x90: \ case 0x91: \ case 0x92: \ case 0x93: \ case 0x94: \ case 0x95: \ case 0x96: \ case 0x97: \ case 0x98: \ case 0x99: \ case 0x9a: \ case 0x9b: \ case 0x9c: \ case 0x9d: \ case 0x9e: \ case 0x9f: \ case 0xa0: \ case 0xa1: \ case 0xa2: \ case 0xa3: \ case 0xa4: \ case 0xa5: \ case 0xa6: \ case 0xa7: \ case 0xa8: \ case 0xa9: \ case 0xaa: \ case 0xab: \ case 0xac: \ case 0xad: \ case 0xae: \ case 0xaf: \ case 0xb0: \ case 0xb1: \ case 0xb2: \ case 0xb3: \ case 0xb4: \ case 0xb5: \ case 0xb6: \ case 0xb7: \ case 0xb8: \ case 0xb9: \ case 0xba: \ case 0xbb: \ case 0xbc: \ case 0xbd: \ case 0xbe: \ case 0xbf: \ case 0xc0: \ case 0xc1: \ case 0xc2: \ case 0xc3: \ case 0xc4: \ case 0xc5: \ case 0xc6: \ case 0xc7: \ case 0xc8: \ case 0xc9: \ case 0xca: \ case 0xcb: \ case 0xcc: \ case 0xcd: \ case 0xce: \ case 0xcf: \ case 0xd0: \ case 0xd1: \ case 0xd2: \ case 0xd3: \ case 0xd4: \ case 0xd5: \ case 0xd6: \ case 0xd7: \ case 0xd8: \ case 0xd9: \ case 0xda: \ case 0xdb: \ case 0xdc: \ case 0xdd: \ case 0xde: \ case 0xdf: \ case 0xe0: \ case 0xe1: \ case 0xe2: \ case 0xe3: \ case 0xe4: \ case 0xe5: \ case 0xe6: \ case 0xe7: \ case 0xe8: \ case 0xe9: \ case 0xea: \ case 0xeb: \ case 0xec: \ case 0xed: \ case 0xee: \ case 0xef: \ case 0xf0: \ case 0xf1: \ case 0xf2: \ case 0xf3: \ case 0xf4: \ case 0xf5: \ case 0xf6: \ case 0xf7: \ case 0xf8: \ case 0xf9: \ case 0xfa: \ case 0xfb: \ case 0xfc: \ case 0xfd: \ case 0xfe: \ case 0xff static int is_ws(uint8_t c) { switch (c) { case ' ': case '\t': return 1; default: return 0; } } #ifdef __AVX2__ # ifdef _MSC_VER # include static int ctz(unsigned int v) { unsigned long n; /* Assume that v is not 0. */ _BitScanForward(&n, v); return (int)n; } # else /* !_MSC_VER */ # define ctz __builtin_ctz # endif /* !_MSC_VER */ #endif /* __AVX2__ */ static int parser_eof(sfparse_parser *sfp) { return sfp->pos == sfp->end; } static void parser_discard_ows(sfparse_parser *sfp) { for (; !parser_eof(sfp) && is_ws(*sfp->pos); ++sfp->pos) ; } static void parser_discard_sp(sfparse_parser *sfp) { for (; !parser_eof(sfp) && *sfp->pos == ' '; ++sfp->pos) ; } static void parser_set_op_state(sfparse_parser *sfp, uint32_t op) { sfp->state &= ~SFPARSE_STATE_OP_MASK; sfp->state |= op; } static void parser_unset_inner_list_state(sfparse_parser *sfp) { sfp->state &= ~SFPARSE_STATE_INNER_LIST; } #ifdef __AVX2__ static const uint8_t *find_char_key(const uint8_t *first, const uint8_t *last) { const __m256i us = _mm256_set1_epi8('_'); const __m256i ds = _mm256_set1_epi8('-'); const __m256i dot = _mm256_set1_epi8('.'); const __m256i ast = _mm256_set1_epi8('*'); const __m256i r0l = _mm256_set1_epi8('0' - 1); const __m256i r0r = _mm256_set1_epi8('9' + 1); const __m256i r1l = _mm256_set1_epi8('a' - 1); const __m256i r1r = _mm256_set1_epi8('z' + 1); __m256i s, x; uint32_t m; for (; first != last; first += 32) { s = _mm256_loadu_si256((void *)first); x = _mm256_cmpeq_epi8(s, us); x = _mm256_or_si256(_mm256_cmpeq_epi8(s, ds), x); x = _mm256_or_si256(_mm256_cmpeq_epi8(s, dot), x); x = _mm256_or_si256(_mm256_cmpeq_epi8(s, ast), x); x = _mm256_or_si256( _mm256_and_si256(_mm256_cmpgt_epi8(s, r0l), _mm256_cmpgt_epi8(r0r, s)), x); x = _mm256_or_si256( _mm256_and_si256(_mm256_cmpgt_epi8(s, r1l), _mm256_cmpgt_epi8(r1r, s)), x); m = ~(uint32_t)_mm256_movemask_epi8(x); if (m) { return first + ctz(m); } } return last; } #endif /* __AVX2__ */ static int parser_key(sfparse_parser *sfp, sfparse_vec *dest) { const uint8_t *base; #ifdef __AVX2__ const uint8_t *last; #endif /* __AVX2__ */ switch (*sfp->pos) { case '*': LCALPHA_CASES: break; default: return SFPARSE_ERR_PARSE; } base = sfp->pos++; #ifdef __AVX2__ if (sfp->end - sfp->pos >= 32) { last = sfp->pos + ((sfp->end - sfp->pos) & ~0x1fu); sfp->pos = find_char_key(sfp->pos, last); if (sfp->pos != last) { goto fin; } } #endif /* __AVX2__ */ for (; !parser_eof(sfp); ++sfp->pos) { switch (*sfp->pos) { case '_': case '-': case '.': case '*': DIGIT_CASES: LCALPHA_CASES: continue; } break; } #ifdef __AVX2__ fin: #endif /* __AVX2__ */ if (dest) { dest->base = (uint8_t *)base; dest->len = (size_t)(sfp->pos - dest->base); } return 0; } static int parser_number(sfparse_parser *sfp, sfparse_value *dest) { int sign = 1; int64_t value = 0; size_t len = 0; size_t fpos = 0; if (*sfp->pos == '-') { ++sfp->pos; if (parser_eof(sfp)) { return SFPARSE_ERR_PARSE; } sign = -1; } assert(!parser_eof(sfp)); for (; !parser_eof(sfp); ++sfp->pos) { switch (*sfp->pos) { DIGIT_CASES: if (++len > 15) { return SFPARSE_ERR_PARSE; } value *= 10; value += *sfp->pos - '0'; continue; } break; } if (len == 0) { return SFPARSE_ERR_PARSE; } if (parser_eof(sfp) || *sfp->pos != '.') { if (dest) { dest->type = SFPARSE_TYPE_INTEGER; dest->flags = SFPARSE_VALUE_FLAG_NONE; dest->integer = value * sign; } return 0; } /* decimal */ if (len > 12) { return SFPARSE_ERR_PARSE; } fpos = len; ++sfp->pos; for (; !parser_eof(sfp); ++sfp->pos) { switch (*sfp->pos) { DIGIT_CASES: if (++len > 15) { return SFPARSE_ERR_PARSE; } value *= 10; value += *sfp->pos - '0'; continue; } break; } if (fpos == len || len - fpos > 3) { return SFPARSE_ERR_PARSE; } if (dest) { dest->type = SFPARSE_TYPE_DECIMAL; dest->flags = SFPARSE_VALUE_FLAG_NONE; dest->decimal.numer = value * sign; switch (len - fpos) { case 1: dest->decimal.denom = 10; break; case 2: dest->decimal.denom = 100; break; case 3: dest->decimal.denom = 1000; break; } } return 0; } static int parser_date(sfparse_parser *sfp, sfparse_value *dest) { int rv; sfparse_value val; /* The first byte has already been validated by the caller. */ assert('@' == *sfp->pos); ++sfp->pos; if (parser_eof(sfp)) { return SFPARSE_ERR_PARSE; } rv = parser_number(sfp, &val); if (rv != 0) { return rv; } if (val.type != SFPARSE_TYPE_INTEGER) { return SFPARSE_ERR_PARSE; } if (dest) { *dest = val; dest->type = SFPARSE_TYPE_DATE; } return 0; } #ifdef __AVX2__ static const uint8_t *find_char_string(const uint8_t *first, const uint8_t *last) { const __m256i bs = _mm256_set1_epi8('\\'); const __m256i dq = _mm256_set1_epi8('"'); const __m256i del = _mm256_set1_epi8(0x7f); const __m256i sp = _mm256_set1_epi8(' '); __m256i s, x; uint32_t m; for (; first != last; first += 32) { s = _mm256_loadu_si256((void *)first); x = _mm256_cmpgt_epi8(sp, s); x = _mm256_or_si256(_mm256_cmpeq_epi8(s, bs), x); x = _mm256_or_si256(_mm256_cmpeq_epi8(s, dq), x); x = _mm256_or_si256(_mm256_cmpeq_epi8(s, del), x); m = (uint32_t)_mm256_movemask_epi8(x); if (m) { return first + ctz(m); } } return last; } #endif /* __AVX2__ */ static int parser_string(sfparse_parser *sfp, sfparse_value *dest) { const uint8_t *base; #ifdef __AVX2__ const uint8_t *last; #endif /* __AVX2__ */ uint32_t flags = SFPARSE_VALUE_FLAG_NONE; /* The first byte has already been validated by the caller. */ assert('"' == *sfp->pos); base = ++sfp->pos; #ifdef __AVX2__ for (; sfp->end - sfp->pos >= 32; ++sfp->pos) { last = sfp->pos + ((sfp->end - sfp->pos) & ~0x1fu); sfp->pos = find_char_string(sfp->pos, last); if (sfp->pos == last) { break; } switch (*sfp->pos) { case '\\': ++sfp->pos; if (parser_eof(sfp)) { return SFPARSE_ERR_PARSE; } switch (*sfp->pos) { case '"': case '\\': flags = SFPARSE_VALUE_FLAG_ESCAPED_STRING; break; default: return SFPARSE_ERR_PARSE; } break; case '"': goto fin; default: return SFPARSE_ERR_PARSE; } } #endif /* __AVX2__ */ for (; !parser_eof(sfp); ++sfp->pos) { switch (*sfp->pos) { X20_21_CASES: X23_5B_CASES: X5D_7E_CASES: break; case '\\': ++sfp->pos; if (parser_eof(sfp)) { return SFPARSE_ERR_PARSE; } switch (*sfp->pos) { case '"': case '\\': flags = SFPARSE_VALUE_FLAG_ESCAPED_STRING; break; default: return SFPARSE_ERR_PARSE; } break; case '"': goto fin; default: return SFPARSE_ERR_PARSE; } } return SFPARSE_ERR_PARSE; fin: if (dest) { dest->type = SFPARSE_TYPE_STRING; dest->flags = flags; dest->vec.len = (size_t)(sfp->pos - base); dest->vec.base = dest->vec.len == 0 ? NULL : (uint8_t *)base; } ++sfp->pos; return 0; } #ifdef __AVX2__ static const uint8_t *find_char_token(const uint8_t *first, const uint8_t *last) { /* r0: !..:, excluding "(), r1: A..Z r2: ^..~, excluding {} */ const __m256i r0l = _mm256_set1_epi8('!' - 1); const __m256i r0r = _mm256_set1_epi8(':' + 1); const __m256i dq = _mm256_set1_epi8('"'); const __m256i prl = _mm256_set1_epi8('('); const __m256i prr = _mm256_set1_epi8(')'); const __m256i comma = _mm256_set1_epi8(','); const __m256i r1l = _mm256_set1_epi8('A' - 1); const __m256i r1r = _mm256_set1_epi8('Z' + 1); const __m256i r2l = _mm256_set1_epi8('^' - 1); const __m256i r2r = _mm256_set1_epi8('~' + 1); const __m256i cbl = _mm256_set1_epi8('{'); const __m256i cbr = _mm256_set1_epi8('}'); __m256i s, x; uint32_t m; for (; first != last; first += 32) { s = _mm256_loadu_si256((void *)first); x = _mm256_andnot_si256( _mm256_cmpeq_epi8(s, comma), _mm256_andnot_si256( _mm256_cmpeq_epi8(s, prr), _mm256_andnot_si256( _mm256_cmpeq_epi8(s, prl), _mm256_andnot_si256(_mm256_cmpeq_epi8(s, dq), _mm256_and_si256(_mm256_cmpgt_epi8(s, r0l), _mm256_cmpgt_epi8(r0r, s)))))); x = _mm256_or_si256( _mm256_and_si256(_mm256_cmpgt_epi8(s, r1l), _mm256_cmpgt_epi8(r1r, s)), x); x = _mm256_or_si256( _mm256_andnot_si256( _mm256_cmpeq_epi8(s, cbr), _mm256_andnot_si256(_mm256_cmpeq_epi8(s, cbl), _mm256_and_si256(_mm256_cmpgt_epi8(s, r2l), _mm256_cmpgt_epi8(r2r, s)))), x); m = ~(uint32_t)_mm256_movemask_epi8(x); if (m) { return first + ctz(m); } } return last; } #endif /* __AVX2__ */ static int parser_token(sfparse_parser *sfp, sfparse_value *dest) { const uint8_t *base; #ifdef __AVX2__ const uint8_t *last; #endif /* __AVX2__ */ /* The first byte has already been validated by the caller. */ base = sfp->pos++; #ifdef __AVX2__ if (sfp->end - sfp->pos >= 32) { last = sfp->pos + ((sfp->end - sfp->pos) & ~0x1fu); sfp->pos = find_char_token(sfp->pos, last); if (sfp->pos != last) { goto fin; } } #endif /* __AVX2__ */ for (; !parser_eof(sfp); ++sfp->pos) { switch (*sfp->pos) { TOKEN_CASES: continue; } break; } #ifdef __AVX2__ fin: #endif /* __AVX2__ */ if (dest) { dest->type = SFPARSE_TYPE_TOKEN; dest->flags = SFPARSE_VALUE_FLAG_NONE; dest->vec.base = (uint8_t *)base; dest->vec.len = (size_t)(sfp->pos - base); } return 0; } #ifdef __AVX2__ static const uint8_t *find_char_byteseq(const uint8_t *first, const uint8_t *last) { const __m256i pls = _mm256_set1_epi8('+'); const __m256i fs = _mm256_set1_epi8('/'); const __m256i r0l = _mm256_set1_epi8('0' - 1); const __m256i r0r = _mm256_set1_epi8('9' + 1); const __m256i r1l = _mm256_set1_epi8('A' - 1); const __m256i r1r = _mm256_set1_epi8('Z' + 1); const __m256i r2l = _mm256_set1_epi8('a' - 1); const __m256i r2r = _mm256_set1_epi8('z' + 1); __m256i s, x; uint32_t m; for (; first != last; first += 32) { s = _mm256_loadu_si256((void *)first); x = _mm256_cmpeq_epi8(s, pls); x = _mm256_or_si256(_mm256_cmpeq_epi8(s, fs), x); x = _mm256_or_si256( _mm256_and_si256(_mm256_cmpgt_epi8(s, r0l), _mm256_cmpgt_epi8(r0r, s)), x); x = _mm256_or_si256( _mm256_and_si256(_mm256_cmpgt_epi8(s, r1l), _mm256_cmpgt_epi8(r1r, s)), x); x = _mm256_or_si256( _mm256_and_si256(_mm256_cmpgt_epi8(s, r2l), _mm256_cmpgt_epi8(r2r, s)), x); m = ~(uint32_t)_mm256_movemask_epi8(x); if (m) { return first + ctz(m); } } return last; } #endif /* __AVX2__ */ static int parser_byteseq(sfparse_parser *sfp, sfparse_value *dest) { const uint8_t *base; #ifdef __AVX2__ const uint8_t *last; #endif /* __AVX2__ */ /* The first byte has already been validated by the caller. */ assert(':' == *sfp->pos); base = ++sfp->pos; #ifdef __AVX2__ if (sfp->end - sfp->pos >= 32) { last = sfp->pos + ((sfp->end - sfp->pos) & ~0x1fu); sfp->pos = find_char_byteseq(sfp->pos, last); } #endif /* __AVX2__ */ for (; !parser_eof(sfp); ++sfp->pos) { switch (*sfp->pos) { case '+': case '/': DIGIT_CASES: ALPHA_CASES: continue; case '=': switch ((sfp->pos - base) & 0x3) { case 0: case 1: return SFPARSE_ERR_PARSE; case 2: ++sfp->pos; if (parser_eof(sfp)) { return SFPARSE_ERR_PARSE; } if (*sfp->pos == '=') { ++sfp->pos; } break; case 3: ++sfp->pos; break; } if (parser_eof(sfp) || *sfp->pos != ':') { return SFPARSE_ERR_PARSE; } goto fin; case ':': if (((sfp->pos - base) & 0x3) == 1) { return SFPARSE_ERR_PARSE; } goto fin; default: return SFPARSE_ERR_PARSE; } } return SFPARSE_ERR_PARSE; fin: if (dest) { dest->type = SFPARSE_TYPE_BYTESEQ; dest->flags = SFPARSE_VALUE_FLAG_NONE; dest->vec.len = (size_t)(sfp->pos - base); dest->vec.base = dest->vec.len == 0 ? NULL : (uint8_t *)base; } ++sfp->pos; return 0; } static int parser_boolean(sfparse_parser *sfp, sfparse_value *dest) { int b; /* The first byte has already been validated by the caller. */ assert('?' == *sfp->pos); ++sfp->pos; if (parser_eof(sfp)) { return SFPARSE_ERR_PARSE; } switch (*sfp->pos) { case '0': b = 0; break; case '1': b = 1; break; default: return SFPARSE_ERR_PARSE; } ++sfp->pos; if (dest) { dest->type = SFPARSE_TYPE_BOOLEAN; dest->flags = SFPARSE_VALUE_FLAG_NONE; dest->boolean = b; } return 0; } static int pctdecode(uint8_t *pc, const uint8_t **ppos) { uint8_t c, b = **ppos; switch (b) { DIGIT_CASES: c = (uint8_t)((b - '0') << 4); break; LCHEXALPHA_CASES: c = (uint8_t)((b - 'a' + 10) << 4); break; default: return -1; } b = *++*ppos; switch (b) { DIGIT_CASES: c |= (uint8_t)(b - '0'); break; LCHEXALPHA_CASES: c |= (uint8_t)(b - 'a' + 10); break; default: return -1; } *pc = c; ++*ppos; return 0; } /* Start of utf8 dfa */ /* Copyright (c) 2008-2010 Bjoern Hoehrmann * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. * * Copyright (c) 2008-2009 Bjoern Hoehrmann * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, * modify, merge, publish, distribute, sublicense, and/or sell copies * of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #define UTF8_ACCEPT 0 #define UTF8_REJECT 12 /* clang-format off */ static const uint8_t utf8d[] = { /* * The first part of the table maps bytes to character classes that * to reduce the size of the transition table and create bitmasks. */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, /* * The second part is a transition table that maps a combination * of a state of the automaton and a character class to a state. */ 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,12,12,12,12,12, }; /* clang-format on */ static void utf8_decode(uint32_t *state, uint8_t byte) { *state = utf8d[256 + *state + utf8d[byte]]; } /* End of utf8 dfa */ static int parser_dispstring(sfparse_parser *sfp, sfparse_value *dest) { const uint8_t *base; uint8_t c; uint32_t utf8state = UTF8_ACCEPT; assert('%' == *sfp->pos); ++sfp->pos; if (parser_eof(sfp) || *sfp->pos != '"') { return SFPARSE_ERR_PARSE; } base = ++sfp->pos; for (; !parser_eof(sfp);) { switch (*sfp->pos) { X00_1F_CASES: X7F_FF_CASES: return SFPARSE_ERR_PARSE; case '%': ++sfp->pos; if (sfp->pos + 2 > sfp->end) { return SFPARSE_ERR_PARSE; } if (pctdecode(&c, &sfp->pos) != 0) { return SFPARSE_ERR_PARSE; } utf8_decode(&utf8state, c); if (utf8state == UTF8_REJECT) { return SFPARSE_ERR_PARSE; } break; case '"': if (utf8state != UTF8_ACCEPT) { return SFPARSE_ERR_PARSE; } if (dest) { dest->type = SFPARSE_TYPE_DISPSTRING; dest->flags = SFPARSE_VALUE_FLAG_NONE; dest->vec.len = (size_t)(sfp->pos - base); dest->vec.base = dest->vec.len == 0 ? NULL : (uint8_t *)base; } ++sfp->pos; return 0; default: if (utf8state != UTF8_ACCEPT) { return SFPARSE_ERR_PARSE; } ++sfp->pos; } } return SFPARSE_ERR_PARSE; } static int parser_bare_item(sfparse_parser *sfp, sfparse_value *dest) { switch (*sfp->pos) { case '"': return parser_string(sfp, dest); case '-': DIGIT_CASES: return parser_number(sfp, dest); case '@': return parser_date(sfp, dest); case ':': return parser_byteseq(sfp, dest); case '?': return parser_boolean(sfp, dest); case '*': ALPHA_CASES: return parser_token(sfp, dest); case '%': return parser_dispstring(sfp, dest); default: return SFPARSE_ERR_PARSE; } } static int parser_skip_inner_list(sfparse_parser *sfp); int sfparse_parser_param(sfparse_parser *sfp, sfparse_vec *dest_key, sfparse_value *dest_value) { int rv; switch (sfp->state & SFPARSE_STATE_OP_MASK) { case SFPARSE_STATE_BEFORE: rv = parser_skip_inner_list(sfp); if (rv != 0) { return rv; } /* fall through */ case SFPARSE_STATE_BEFORE_PARAMS: parser_set_op_state(sfp, SFPARSE_STATE_PARAMS); break; case SFPARSE_STATE_PARAMS: break; default: assert(0); abort(); } if (parser_eof(sfp) || *sfp->pos != ';') { parser_set_op_state(sfp, SFPARSE_STATE_AFTER); return SFPARSE_ERR_EOF; } ++sfp->pos; parser_discard_sp(sfp); if (parser_eof(sfp)) { return SFPARSE_ERR_PARSE; } rv = parser_key(sfp, dest_key); if (rv != 0) { return rv; } if (parser_eof(sfp) || *sfp->pos != '=') { if (dest_value) { dest_value->type = SFPARSE_TYPE_BOOLEAN; dest_value->flags = SFPARSE_VALUE_FLAG_NONE; dest_value->boolean = 1; } return 0; } ++sfp->pos; if (parser_eof(sfp)) { return SFPARSE_ERR_PARSE; } return parser_bare_item(sfp, dest_value); } static int parser_skip_params(sfparse_parser *sfp) { int rv; for (;;) { rv = sfparse_parser_param(sfp, NULL, NULL); switch (rv) { case 0: break; case SFPARSE_ERR_EOF: return 0; case SFPARSE_ERR_PARSE: return rv; default: assert(0); abort(); } } } int sfparse_parser_inner_list(sfparse_parser *sfp, sfparse_value *dest) { int rv; switch (sfp->state & SFPARSE_STATE_OP_MASK) { case SFPARSE_STATE_BEFORE: parser_discard_sp(sfp); if (parser_eof(sfp)) { return SFPARSE_ERR_PARSE; } break; case SFPARSE_STATE_BEFORE_PARAMS: rv = parser_skip_params(sfp); if (rv != 0) { return rv; } /* Technically, we are entering SFPARSE_STATE_AFTER, but we will set another state without reading the state. */ /* parser_set_op_state(sfp, SFPARSE_STATE_AFTER); */ /* fall through */ case SFPARSE_STATE_AFTER: if (parser_eof(sfp)) { return SFPARSE_ERR_PARSE; } switch (*sfp->pos) { case ' ': parser_discard_sp(sfp); if (parser_eof(sfp)) { return SFPARSE_ERR_PARSE; } break; case ')': break; default: return SFPARSE_ERR_PARSE; } break; default: assert(0); abort(); } if (*sfp->pos == ')') { ++sfp->pos; parser_unset_inner_list_state(sfp); parser_set_op_state(sfp, SFPARSE_STATE_BEFORE_PARAMS); return SFPARSE_ERR_EOF; } rv = parser_bare_item(sfp, dest); if (rv != 0) { return rv; } parser_set_op_state(sfp, SFPARSE_STATE_BEFORE_PARAMS); return 0; } static int parser_skip_inner_list(sfparse_parser *sfp) { int rv; for (;;) { rv = sfparse_parser_inner_list(sfp, NULL); switch (rv) { case 0: break; case SFPARSE_ERR_EOF: return 0; case SFPARSE_ERR_PARSE: return rv; default: assert(0); abort(); } } } static int parser_next_key_or_item(sfparse_parser *sfp) { parser_discard_ows(sfp); if (parser_eof(sfp)) { return SFPARSE_ERR_EOF; } if (*sfp->pos != ',') { return SFPARSE_ERR_PARSE; } ++sfp->pos; parser_discard_ows(sfp); if (parser_eof(sfp)) { return SFPARSE_ERR_PARSE; } return 0; } static int parser_dict_value(sfparse_parser *sfp, sfparse_value *dest) { int rv; if (parser_eof(sfp) || *(sfp->pos) != '=') { /* Boolean true */ if (dest) { dest->type = SFPARSE_TYPE_BOOLEAN; dest->flags = SFPARSE_VALUE_FLAG_NONE; dest->boolean = 1; } sfp->state = SFPARSE_STATE_DICT_BEFORE_PARAMS; return 0; } ++sfp->pos; if (parser_eof(sfp)) { return SFPARSE_ERR_PARSE; } if (*sfp->pos == '(') { if (dest) { dest->type = SFPARSE_TYPE_INNER_LIST; dest->flags = SFPARSE_VALUE_FLAG_NONE; } ++sfp->pos; sfp->state = SFPARSE_STATE_DICT_INNER_LIST_BEFORE; return 0; } rv = parser_bare_item(sfp, dest); if (rv != 0) { return rv; } sfp->state = SFPARSE_STATE_DICT_BEFORE_PARAMS; return 0; } int sfparse_parser_dict(sfparse_parser *sfp, sfparse_vec *dest_key, sfparse_value *dest_value) { int rv; switch (sfp->state) { case SFPARSE_STATE_DICT_INNER_LIST_BEFORE: rv = parser_skip_inner_list(sfp); if (rv != 0) { return rv; } /* fall through */ case SFPARSE_STATE_DICT_BEFORE_PARAMS: rv = parser_skip_params(sfp); if (rv != 0) { return rv; } /* fall through */ case SFPARSE_STATE_DICT_AFTER: rv = parser_next_key_or_item(sfp); if (rv != 0) { return rv; } break; case SFPARSE_STATE_INITIAL: parser_discard_sp(sfp); if (parser_eof(sfp)) { return SFPARSE_ERR_EOF; } break; default: assert(0); abort(); } rv = parser_key(sfp, dest_key); if (rv != 0) { return rv; } return parser_dict_value(sfp, dest_value); } int sfparse_parser_list(sfparse_parser *sfp, sfparse_value *dest) { int rv; switch (sfp->state) { case SFPARSE_STATE_LIST_INNER_LIST_BEFORE: rv = parser_skip_inner_list(sfp); if (rv != 0) { return rv; } /* fall through */ case SFPARSE_STATE_LIST_BEFORE_PARAMS: rv = parser_skip_params(sfp); if (rv != 0) { return rv; } /* fall through */ case SFPARSE_STATE_LIST_AFTER: rv = parser_next_key_or_item(sfp); if (rv != 0) { return rv; } break; case SFPARSE_STATE_INITIAL: parser_discard_sp(sfp); if (parser_eof(sfp)) { return SFPARSE_ERR_EOF; } break; default: assert(0); abort(); } if (*sfp->pos == '(') { if (dest) { dest->type = SFPARSE_TYPE_INNER_LIST; dest->flags = SFPARSE_VALUE_FLAG_NONE; } ++sfp->pos; sfp->state = SFPARSE_STATE_LIST_INNER_LIST_BEFORE; return 0; } rv = parser_bare_item(sfp, dest); if (rv != 0) { return rv; } sfp->state = SFPARSE_STATE_LIST_BEFORE_PARAMS; return 0; } int sfparse_parser_item(sfparse_parser *sfp, sfparse_value *dest) { int rv; switch (sfp->state) { case SFPARSE_STATE_INITIAL: parser_discard_sp(sfp); if (parser_eof(sfp)) { return SFPARSE_ERR_PARSE; } break; case SFPARSE_STATE_ITEM_INNER_LIST_BEFORE: rv = parser_skip_inner_list(sfp); if (rv != 0) { return rv; } /* fall through */ case SFPARSE_STATE_ITEM_BEFORE_PARAMS: rv = parser_skip_params(sfp); if (rv != 0) { return rv; } /* fall through */ case SFPARSE_STATE_ITEM_AFTER: parser_discard_sp(sfp); if (!parser_eof(sfp)) { return SFPARSE_ERR_PARSE; } return SFPARSE_ERR_EOF; default: assert(0); abort(); } if (*sfp->pos == '(') { if (dest) { dest->type = SFPARSE_TYPE_INNER_LIST; dest->flags = SFPARSE_VALUE_FLAG_NONE; } ++sfp->pos; sfp->state = SFPARSE_STATE_ITEM_INNER_LIST_BEFORE; return 0; } rv = parser_bare_item(sfp, dest); if (rv != 0) { return rv; } sfp->state = SFPARSE_STATE_ITEM_BEFORE_PARAMS; return 0; } void sfparse_parser_init(sfparse_parser *sfp, const uint8_t *data, size_t datalen) { if (datalen == 0) { sfp->pos = sfp->end = NULL; } else { sfp->pos = data; sfp->end = data + datalen; } sfp->state = SFPARSE_STATE_INITIAL; } void sfparse_unescape(sfparse_vec *dest, const sfparse_vec *src) { const uint8_t *p, *q; uint8_t *o; size_t len, slen; if (src->len == 0) { dest->len = 0; return; } o = dest->base; p = src->base; len = src->len; for (;;) { q = memchr(p, '\\', len); if (q == NULL) { memcpy(o, p, len); o += len; dest->len = (size_t)(o - dest->base); return; } slen = (size_t)(q - p); memcpy(o, p, slen); o += slen; p = q + 1; *o++ = *p++; len -= slen + 2; } } void sfparse_base64decode(sfparse_vec *dest, const sfparse_vec *src) { static const int index_tbl[] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; uint8_t *o; const uint8_t *p, *end; uint32_t n; size_t i, left; int idx; if (src->len == 0) { dest->len = 0; return; } o = dest->base; p = src->base; left = src->len & 0x3; if (left == 0 && src->base[src->len - 1] == '=') { left = 4; } end = src->base + src->len - left; for (; p != end;) { n = 0; for (i = 1; i <= 4; ++i, ++p) { idx = index_tbl[*p]; assert(idx != -1); n += (uint32_t)(idx << (24 - i * 6)); } *o++ = (uint8_t)(n >> 16); *o++ = (n >> 8) & 0xffu; *o++ = n & 0xffu; } switch (left) { case 0: goto fin; case 1: assert(0); abort(); case 3: if (src->base[src->len - 1] == '=') { left = 2; } break; case 4: assert('=' == src->base[src->len - 1]); if (src->base[src->len - 2] == '=') { left = 2; } else { left = 3; } break; } switch (left) { case 2: *o = (uint8_t)(index_tbl[*p++] << 2); *o++ |= (uint8_t)(index_tbl[*p++] >> 4); break; case 3: n = (uint32_t)(index_tbl[*p++] << 10); n += (uint32_t)(index_tbl[*p++] << 4); n += (uint32_t)(index_tbl[*p++] >> 2); *o++ = (n >> 8) & 0xffu; *o++ = n & 0xffu; break; } fin: dest->len = (size_t)(o - dest->base); } void sfparse_pctdecode(sfparse_vec *dest, const sfparse_vec *src) { const uint8_t *p, *q; uint8_t *o; size_t len, slen; if (src->len == 0) { dest->len = 0; return; } o = dest->base; p = src->base; len = src->len; for (;;) { q = memchr(p, '%', len); if (q == NULL) { memcpy(o, p, len); o += len; dest->len = (size_t)(o - dest->base); return; } slen = (size_t)(q - p); memcpy(o, p, slen); o += slen; p = q + 1; pctdecode(o++, &p); len -= slen + 3; } }