reader.h 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677
  1. #pragma once
  2. #include "lexer_base.h"
  3. #include "symbols.h"
  4. #include <library/cpp/yson_pull/reader.h>
  5. #include <util/generic/maybe.h>
  6. #include <util/generic/vector.h>
  7. namespace NYsonPull {
  8. namespace NDetail {
  9. /*! \internal */
  10. ////////////////////////////////////////////////////////////////////////////////
  11. enum class special_token : ui8 {
  12. // Special values:
  13. // YSON
  14. semicolon = 0, // ;
  15. equals = 1, // =
  16. hash = 2, // #
  17. left_bracket = 3, // [
  18. right_bracket = 4, // ]
  19. left_brace = 5, // {
  20. right_brace = 6, // }
  21. left_angle = 7, // <
  22. right_angle = 8, // >
  23. };
  // char_class tree representation (the low bits select the branch, the
  // remaining high bits hold the sub-class index):
  // Root = xb
  //   BinaryStringOrOtherSpecialToken = x0b
  //     BinaryString = 00b
  //     OtherSpecialToken = 10b (special_token value in bits 2 and above)
  //   Other = x1b
  //     BinaryScalar = xx01b
  //       BinaryInt64 = 00001b
  //       BinaryDouble = 00101b
  //       BinaryFalse = 01001b
  //       BinaryTrue = 01101b
  //       BinaryUint64 = 10001b
  //     Other = xxx11b
  //       Quote = 00011b
  //       Number ('+', '-', digits) = 00111b
  //       String = 01011b
  //       None = 10111b
  //       Percent = 11011b
  // The codes 01111b (formerly Space) and 10011b (formerly Plus) are unused:
  // whitespace is classified as None and '+' as Number.
  43. enum class char_class : ui8 {
  44. binary_string = 0, // = 00b
  45. special_token_mask = 2, // = 10b
  46. semicolon = 2 + (0 << 2),
  47. equals = 2 + (1 << 2),
  48. hash = 2 + (2 << 2),
  49. left_bracket = 2 + (3 << 2),
  50. right_bracket = 2 + (4 << 2),
  51. left_brace = 2 + (5 << 2),
  52. right_brace = 2 + (6 << 2),
  53. left_angle = 2 + (7 << 2),
  54. right_angle = 2 + (8 << 2),
  55. binary_scalar_mask = 1,
  56. binary_int64 = 1 + (0 << 2), // = 001b
  57. binary_double = 1 + (1 << 2), // = 101b
  58. binary_false = 1 + (2 << 2), // = 1001b
  59. binary_true = 1 + (3 << 2), // = 1101b
  60. binary_uint64 = 1 + (4 << 2), // = 10001b
  61. other_mask = 3,
  62. quote = 3 + (0 << 2), // = 00011b
  63. number = 3 + (1 << 2), // = 00111b
  64. string = 3 + (2 << 2), // = 01011b
  65. percent = 3 + (6 << 2), // = 11011b
  66. none = 3 + (5 << 2), // = 10111b
  67. };
  68. #define CHAR_SUBCLASS(x) (static_cast<ui8>(x) >> 2) // shifts out the two category bits of a char_class value, leaving its sub-class index; not referenced anywhere in this header
  69. inline char_class get_char_class(ui8 ch) {
  70. #define NN char_class::none
  71. #define BS char_class::binary_string
  72. #define BI char_class::binary_int64
  73. #define BD char_class::binary_double
  74. #define BF char_class::binary_false
  75. #define BT char_class::binary_true
  76. #define BU char_class::binary_uint64
  77. #define SP NN // char_class::space
  78. #define NB char_class::number
  79. #define ST char_class::string
  80. #define QU char_class::quote
  81. #define PC char_class::percent
  82. #define TT(name) (static_cast<char_class>( \
  83. (static_cast<ui8>(special_token::name) << 2) | static_cast<ui8>(char_class::special_token_mask)))
  84. static constexpr char_class lookup[256] =
  85. {
  86. NN, BS, BI, BD, BF, BT, BU, NN, NN, SP, SP, SP, SP, SP, NN, NN,
  87. NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN,
  88. // 32
  89. SP, // ' '
  90. NN, // '!'
  91. QU, // '"'
  92. TT(hash), // '#'
  93. NN, // '$'
  94. PC, // '%'
  95. NN, // '&'
  96. NN, // "'"
  97. NN, // '('
  98. NN, // ')'
  99. NN, // '*'
  100. NB, // '+'
  101. NN, // ','
  102. NB, // '-'
  103. NN, // '.'
  104. NN, // '/'
  105. // 48
  106. NB, NB, NB, NB, NB, NB, NB, NB, NB, NB, // '0' - '9'
  107. NN, // ':'
  108. TT(semicolon), // ';'
  109. TT(left_angle), // '<'
  110. TT(equals), // '='
  111. TT(right_angle), // '>'
  112. NN, // '?'
  113. // 64
  114. NN, // '@'
  115. ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, // 'A' - 'M'
  116. ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, // 'N' - 'Z'
  117. TT(left_bracket), // '['
  118. NN, // '\'
  119. TT(right_bracket), // ']'
  120. NN, // '^'
  121. ST, // '_'
  122. // 96
  123. NN, // '`'
  124. ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, // 'a' - 'm'
  125. ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, // 'n' - 'z'
  126. TT(left_brace), // '{'
  127. NN, // '|'
  128. TT(right_brace), // '}'
  129. NN, // '~'
  130. NN, // '^?' non-printable
  131. // 128
  132. NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN,
  133. NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN,
  134. NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN,
  135. NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN,
  136. NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN,
  137. NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN,
  138. NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN,
  139. NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN};
  140. #undef NN
  141. #undef BS
  142. #undef BI
  143. #undef BD
  144. #undef SP
  145. #undef NB
  146. #undef ST
  147. #undef QU
  148. #undef TT
  149. return lookup[ch];
  150. }
  151. template <bool EnableLinePositionInfo>
  152. class gen_reader_impl {
  153. enum class state { //! what the reader expects to see next in the input
  154. delimiter = 0, //! expecting ';' or closing-char ('>', ']', '}')
  155. maybe_value = 1, //! expecting a value or closing-char
  156. maybe_key = 2, //! expecting a key or closing-char
  157. equals = 3, //! expecting '=' (followed by value)
  158. value = 4, //! expecting a value
  159. value_noattr = 5, //! expecting a value w/o attrs (after attrs)
  160. // rare states are numbered from first_rare_state upwards, so next_event() can route them to next_event_cold() with a single comparison
  161. first_rare_state = 6,
  162. before_begin = first_rare_state, //! before started reading the stream
  163. before_end = first_rare_state + 1, //! expecting end of stream
  164. after_end = first_rare_state + 2, //! after end of stream; a further read fails
  165. };
  166. lexer_base<EnableLinePositionInfo> lexer_; //! byte-level reader, constructed from the input stream and the optional memory limit
  167. state state_; //! current parser state; starts as state::before_begin
  168. TEvent event_; //! most recently produced event, returned by last_event() and next_event()
  169. TVector<EEventType> stack_; //! currently open paired events (BeginStream, BeginList, BeginMap, BeginAttributes), innermost last
  170. EStreamType mode_; //! stream type given at construction (Node, ListFragment or MapFragment)
  171. public:
  172. gen_reader_impl(
  173. NYsonPull::NInput::IStream& buffer,
  174. EStreamType mode,
  175. TMaybe<size_t> memoryLimit = {})
  176. : lexer_(buffer, memoryLimit)
  177. , state_{state::before_begin}
  178. , mode_{mode} {
  179. }
  180. const TEvent& last_event() const {
  181. return event_;
  182. }
  183. ATTRIBUTE(hot)
  184. const TEvent& next_event() {
  185. if (Y_LIKELY(state_ < state::first_rare_state)) {
  186. // 'hot' handler for in-stream events
  187. next_event_hot();
  188. } else {
  189. // these events happen no more than once per stream
  190. next_event_cold();
  191. }
  192. return event_;
  193. }
  194. private:
  195. ATTRIBUTE(hot)
  196. void next_event_hot() {
  197. auto ch = lexer_.get_byte();
  198. auto cls = get_char_class(ch);
  199. if (Y_UNLIKELY(cls == char_class::none)) {
  200. ch = lexer_.skip_space_and_get_byte();
  201. if (Y_UNLIKELY(ch == NSymbol::eof)) {
  202. handle_eof();
  203. return;
  204. }
  205. cls = get_char_class(ch);
  206. }
  207. // states maybe_value/value/value_noattr are distinguished
  208. // later in state_value_special
  209. switch (state_) {
  210. case state::maybe_value:
  211. state_value(ch, cls);
  212. break;
  213. case state::maybe_key:
  214. state_maybe_key(ch, cls);
  215. break;
  216. case state::equals:
  217. state_equals(ch);
  218. break;
  219. case state::value:
  220. state_value(ch, cls);
  221. break;
  222. case state::value_noattr:
  223. state_value(ch, cls);
  224. break;
  225. case state::delimiter:
  226. state_delimiter(ch, cls);
  227. break;
  228. default:
  229. Y_UNREACHABLE();
  230. }
  231. }
  232. ATTRIBUTE(noinline, cold)
  233. void next_event_cold() {
  234. switch (state_) {
  235. case state::before_begin:
  236. state_before_begin();
  237. break;
  238. case state::after_end:
  239. lexer_.fail("Attempted read past stream end");
  240. case state::before_end:
  241. state_before_end();
  242. break;
  243. default:
  244. Y_UNREACHABLE();
  245. }
  246. }
  247. //! Present a scalar value for caller: wraps \p value in TScalar and stores it as the current event
  248. template <typename T>
  249. void yield(T value) {
  250. event_ = TEvent{TScalar{value}};
  251. }
  252. //! Present a scalar value with non-scalar tag (i.e. key): stores an event of the given \p type carrying \p value wrapped in TScalar
  253. template <typename T>
  254. void yield(EEventType type, T value) {
  255. event_ = TEvent{type, TScalar{value}};
  256. }
  257. //! Present a value from number variant
  258. void yield(const number& value) {
  259. switch (value.type) {
  260. case number_type::int64:
  261. yield(value.value.as_int64);
  262. break;
  263. case number_type::uint64:
  264. yield(value.value.as_uint64);
  265. break;
  266. case number_type::float64:
  267. yield(value.value.as_float64);
  268. break;
  269. }
  270. }
  271. //! Present a value from %-literal variant
  272. void yield(const percent_scalar& value) {
  273. switch (value.type) {
  274. case percent_scalar_type::boolean:
  275. yield(value.value.as_boolean);
  276. break;
  277. case percent_scalar_type::float64:
  278. yield(value.value.as_float64);
  279. break;
  280. }
  281. }
  282. //! Present a value-less event: stores an event that carries only \p type
  283. void yield(EEventType type) {
  284. event_ = TEvent{type};
  285. }
  286. //! Push the opening of a paired event onto stack_ (does not yield the event itself)
  287. void push(EEventType type) {
  288. stack_.push_back(type);
  289. }
  290. //! Close the paired_event, verify that delimiters are well-formed
  291. void pop(EEventType first, EEventType last) {
  292. if (Y_UNLIKELY(stack_.empty() || stack_.back() != first)) {
  293. pop_fail(first, last);
  294. return;
  295. }
  296. stack_.pop_back();
  297. yield(last);
  298. switch (first) {
  299. case EEventType::BeginList:
  300. next(state::delimiter);
  301. break;
  302. case EEventType::BeginMap:
  303. next(state::delimiter);
  304. break;
  305. case EEventType::BeginAttributes:
  306. next(state::value_noattr);
  307. break;
  308. case EEventType::BeginStream:
  309. next(state::after_end);
  310. break;
  311. default:
  312. Y_UNREACHABLE();
  313. }
  314. if (Y_UNLIKELY(mode_ == EStreamType::Node && stack_.size() == 1 && state_ == state::delimiter)) {
  315. next(state::before_end);
  316. }
  317. }
  318. ATTRIBUTE(noinline, cold)
  319. void pop_fail(EEventType first, EEventType last) {
  320. if (stack_.empty()) {
  321. lexer_.fail("Unpaired events: expected opening '", first, "' for '", last, "', but event stack is empty");
  322. } else {
  323. lexer_.fail("Unpaired events: expected opening '", first, "' for '", last, "', but '", stack_.back(), "' is found.");
  324. }
  325. }
  326. //! Transition to new_state (plain assignment to state_, no validation)
  327. void next(state new_state) {
  328. state_ = new_state;
  329. }
  330. bool in_map() {
  331. return (stack_.back() == EEventType::BeginMap) || (stack_.back() == EEventType::BeginAttributes) || (stack_.back() == EEventType::BeginStream && mode_ == EStreamType::MapFragment);
  332. }
  333. ATTRIBUTE(noinline, cold)
  334. void handle_eof() {
  335. switch (state_) {
  336. case state::maybe_value:
  337. case state::maybe_key:
  338. case state::delimiter:
  339. case state::before_end:
  340. pop(EEventType::BeginStream, EEventType::EndStream);
  341. return;
  342. default:
  343. lexer_.fail("Unexpected end of stream");
  344. }
  345. }
  346. ATTRIBUTE(noinline, cold)
  347. void state_before_begin() {
  348. push(EEventType::BeginStream);
  349. yield(EEventType::BeginStream);
  350. switch (mode_) {
  351. case EStreamType::Node:
  352. next(state::value);
  353. break;
  354. case EStreamType::ListFragment:
  355. next(state::maybe_value);
  356. break;
  357. case EStreamType::MapFragment:
  358. next(state::maybe_key);
  359. break;
  360. default:
  361. Y_UNREACHABLE();
  362. }
  363. }
  364. ATTRIBUTE(noinline, cold)
  365. void state_before_end() {
  366. auto ch = lexer_.skip_space_and_get_byte();
  367. if (ch == NSymbol::eof) {
  368. handle_eof();
  369. } else {
  370. lexer_.fail("Expected stream end, but found ", NCEscape::quote(ch));
  371. }
  372. }
  373. ATTRIBUTE(hot)
  374. void state_delimiter(ui8 ch, char_class cls) {
  375. if (Y_LIKELY(ch == NSymbol::item_separator)) {
  376. lexer_.advance(1);
  377. next(in_map() ? state::maybe_key : state::maybe_value);
  378. // immediately read next value
  379. next_event_hot();
  380. return;
  381. }
  382. state_delimiter_fallback(ch, cls);
  383. }
  384. ATTRIBUTE(noinline, hot)
  385. void state_delimiter_fallback(ui8 ch, char_class cls) {
  386. auto cls_bits = static_cast<ui8>(cls);
  387. if ((cls_bits & 3) == static_cast<ui8>(char_class::special_token_mask)) {
  388. auto token = static_cast<special_token>(cls_bits >> 2);
  389. lexer_.advance(1);
  390. switch (token) {
  391. /* // handled in the fast track
  392. case special_token::semicolon:
  393. next(in_map()? state::maybe_key : state::maybe_value);
  394. // immediately read next value
  395. return next_event();
  396. */
  397. case special_token::right_bracket:
  398. pop(EEventType::BeginList, EEventType::EndList);
  399. return;
  400. case special_token::right_brace:
  401. pop(EEventType::BeginMap, EEventType::EndMap);
  402. return;
  403. case special_token::right_angle:
  404. pop(EEventType::BeginAttributes, EEventType::EndAttributes);
  405. return;
  406. default:
  407. break;
  408. }
  409. }
  410. COLD_BLOCK_BYVALUE
  411. lexer_.fail(
  412. "Unexpected ", NCEscape::quote(ch), ", expected one of ",
  413. NCEscape::quote(NSymbol::item_separator), ", ",
  414. NCEscape::quote(NSymbol::end_list), ", ",
  415. NCEscape::quote(NSymbol::end_map), ", ",
  416. NCEscape::quote(NSymbol::end_attributes));
  417. COLD_BLOCK_END
  418. }
  419. ATTRIBUTE(noinline, hot)
  420. void state_maybe_key(ui8 ch, char_class cls) {
  421. auto key = TStringBuf{};
  422. // Keys are always strings, put binary-string key into fast lane
  423. if (Y_LIKELY(ch == NSymbol::string_marker)) {
  424. lexer_.advance(1);
  425. key = lexer_.read_binary_string();
  426. } else {
  427. switch (cls) {
  428. case char_class::quote:
  429. lexer_.advance(1);
  430. key = lexer_.read_quoted_string();
  431. break;
  432. case char_class::string:
  433. key = lexer_.read_unquoted_string();
  434. break;
  435. case char_class::right_brace:
  436. lexer_.advance(1);
  437. pop(EEventType::BeginMap, EEventType::EndMap);
  438. return;
  439. case char_class::right_angle:
  440. lexer_.advance(1);
  441. pop(EEventType::BeginAttributes, EEventType::EndAttributes);
  442. return;
  443. default:
  444. COLD_BLOCK_BYVALUE
  445. lexer_.fail("Unexpected ", NCEscape::quote(ch), ", expected key string");
  446. COLD_BLOCK_END
  447. }
  448. }
  449. yield(EEventType::Key, key);
  450. next(state::equals);
  451. }
  452. ATTRIBUTE(hot)
  453. void state_equals(ui8 ch) {
  454. // skip '='
  455. if (Y_UNLIKELY(ch != NSymbol::key_value_separator)) {
  456. COLD_BLOCK_BYVALUE
  457. lexer_.fail("Unexpected ", NCEscape::quote(ch), ", expected ", NCEscape::quote(NSymbol::key_value_separator));
  458. COLD_BLOCK_END
  459. }
  460. lexer_.advance(1);
  461. next(state::value);
  462. // immediately read the following value
  463. // (this symbol yields no result)
  464. next_event_hot();
  465. }
  466. ATTRIBUTE(noinline, hot)
  467. void state_value(ui8 ch, char_class cls) {
  468. auto cls_bits = static_cast<ui8>(cls);
  469. if (cls_bits & 1) { // Other = x1b
  470. if (cls_bits & (1 << 1)) { // Other = xxx11b
  471. state_value_text_scalar(cls);
  472. } else { // BinaryScalar = x01b
  473. state_value_binary_scalar(cls);
  474. }
  475. next(state::delimiter);
  476. } else { // BinaryStringOrOtherSpecialToken = x0b
  477. lexer_.advance(1);
  478. if (cls_bits & 1 << 1) {
  479. // special token
  480. auto token = static_cast<special_token>(cls_bits >> 2);
  481. state_value_special(token, ch);
  482. } else {
  483. // binary string
  484. yield(lexer_.read_binary_string());
  485. next(state::delimiter);
  486. }
  487. }
  488. }
  489. ATTRIBUTE(noinline)
  490. void state_value_special(special_token token, ui8 ch) {
  491. // Value starters are always accepted values
  492. switch (token) {
  493. case special_token::hash:
  494. yield(TScalar{});
  495. next(state::delimiter);
  496. return;
  497. case special_token::left_bracket:
  498. push(EEventType::BeginList);
  499. yield(EEventType::BeginList);
  500. next(state::maybe_value);
  501. return;
  502. case special_token::left_brace:
  503. push(EEventType::BeginMap);
  504. yield(EEventType::BeginMap);
  505. next(state::maybe_key);
  506. return;
  507. default:
  508. break;
  509. }
  510. // ...closing-chars are only allowed in maybe_value state
  511. if (state_ == state::maybe_value) {
  512. switch (token) {
  513. case special_token::right_bracket:
  514. pop(EEventType::BeginList, EEventType::EndList);
  515. return;
  516. case special_token::right_brace:
  517. pop(EEventType::BeginMap, EEventType::EndMap);
  518. return;
  519. // right_angle is impossible in maybe_value state
  520. // (only in delimiter, maybe_key)
  521. default:
  522. break;
  523. }
  524. }
  525. // attributes are not allowed after attributes (thus, value_noattr state)
  526. if (state_ != state::value_noattr && token == special_token::left_angle) {
  527. push(EEventType::BeginAttributes);
  528. yield(EEventType::BeginAttributes);
  529. next(state::maybe_key);
  530. return;
  531. }
  532. COLD_BLOCK_BYVALUE
  533. lexer_.fail("Unexpected ", NCEscape::quote(ch));
  534. COLD_BLOCK_END
  535. }
  536. ATTRIBUTE(hot)
  537. void state_value_binary_scalar(char_class cls) {
  538. lexer_.advance(1);
  539. switch (cls) {
  540. case char_class::binary_double:
  541. yield(lexer_.read_binary_double());
  542. break;
  543. case char_class::binary_int64:
  544. yield(lexer_.read_binary_int64());
  545. break;
  546. case char_class::binary_uint64:
  547. yield(lexer_.read_binary_uint64());
  548. break;
  549. case char_class::binary_false:
  550. yield(false);
  551. break;
  552. case char_class::binary_true:
  553. yield(true);
  554. break;
  555. default:
  556. Y_UNREACHABLE();
  557. }
  558. }
  559. ATTRIBUTE(noinline)
  560. void state_value_text_scalar(char_class cls) {
  561. switch (cls) {
  562. case char_class::quote:
  563. lexer_.advance(1);
  564. yield(lexer_.read_quoted_string());
  565. break;
  566. case char_class::number:
  567. yield(lexer_.read_numeric());
  568. break;
  569. case char_class::string:
  570. yield(lexer_.read_unquoted_string());
  571. break;
  572. case char_class::percent:
  573. lexer_.advance(1);
  574. yield(lexer_.read_percent_scalar());
  575. break;
  576. case char_class::none:
  577. COLD_BLOCK_BYVALUE
  578. lexer_.fail("Invalid yson value.");
  579. COLD_BLOCK_END
  580. break;
  581. default:
  582. Y_UNREACHABLE();
  583. }
  584. }
  585. };
  586. class reader_impl: public gen_reader_impl<false> { //! concrete reader: gen_reader_impl instantiated with EnableLinePositionInfo = false
  587. public:
  588. using gen_reader_impl<false>::gen_reader_impl; //! inherit the base-class constructors unchanged
  589. };
  590. }
  591. }