lexer_detail.h 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296
  1. #pragma once
  2. #include "detail.h"
  3. #include "token.h"
  4. namespace NYson {
  5. ////////////////////////////////////////////////////////////////////////////////
  6. namespace NDetail {
  7. /*! \internal */
  8. ////////////////////////////////////////////////////////////////////////////////
  // EReadStartCase tree representation (branches are selected by the low bits):
  // Root                                =     xb
  //     BinaryStringOrOtherSpecialToken =    x0b
  //         BinaryString                =    00b
  //         OtherSpecialToken           =    10b
  //     Other                           =    x1b
  //         BinaryScalar                = xxx01b
  //             BinaryInt64             = 00001b
  //             BinaryDouble            = 00101b
  //             BinaryFalse             = 01001b
  //             BinaryTrue              = 01101b
  //             BinaryUint64            = 10001b
  //         Other                       = xxx11b
  //             Quote                   = 00011b
  //             DigitOrMinus            = 00111b
  //             String                  = 01011b
  //             Space                   = 01111b
  //             Plus                    = 10011b
  //             None                    = 10111b
  //             Percent                 = 11011b
  // Classification of the first character of a token.
  //
  // The two lowest bits of every value encode the class of the case:
  //   x0b  - BinaryString (00b) or OtherSpecialToken (10b);
  //   01b  - binary scalar, ordinal stored in the bits above;
  //   11b  - textual case, ordinal stored in the bits above.
  enum EReadStartCase : unsigned {
      // Low bit clear.
      BinaryString = 0,             // 00b
      OtherSpecialToken = 2,        // 10b

      // Binary scalars: (ordinal << 2) | 01b.
      BinaryInt64 = (0 << 2) | 1,   // 00001b = 1
      BinaryDouble = (1 << 2) | 1,  // 00101b = 5
      BinaryFalse = (2 << 2) | 1,   // 01001b = 9
      BinaryTrue = (3 << 2) | 1,    // 01101b = 13
      BinaryUint64 = (4 << 2) | 1,  // 10001b = 17

      // Textual cases: (ordinal << 2) | 11b.
      Quote = (0 << 2) | 3,         // 00011b = 3
      DigitOrMinus = (1 << 2) | 3,  // 00111b = 7
      String = (2 << 2) | 3,        // 01011b = 11
      Space = (3 << 2) | 3,         // 01111b = 15
      Plus = (4 << 2) | 3,          // 10011b = 19
      None = (5 << 2) | 3,          // 10111b = 23
      Percent = (6 << 2) | 3        // 11011b = 27
  };
  44. template <class TBlockStream, bool EnableLinePositionInfo>
  45. class TLexer
  46. : public TLexerBase<TBlockStream, EnableLinePositionInfo> {
  47. private:
  48. using TBase = TLexerBase<TBlockStream, EnableLinePositionInfo>;
  49. static EReadStartCase GetStartState(char ch) {
  50. #define NN EReadStartCase::None
  51. #define BS EReadStartCase::BinaryString
  52. #define BI EReadStartCase::BinaryInt64
  53. #define BD EReadStartCase::BinaryDouble
  54. #define BF EReadStartCase::BinaryFalse
  55. #define BT EReadStartCase::BinaryTrue
  56. #define BU EReadStartCase::BinaryUint64
  57. #define SP NN // EReadStartCase::Space
  58. #define DM EReadStartCase::DigitOrMinus
  59. #define ST EReadStartCase::String
  60. #define PL EReadStartCase::Plus
  61. #define QU EReadStartCase::Quote
  62. #define PC EReadStartCase::Percent
  63. #define TT(name) (EReadStartCase(static_cast<ui8>(ETokenType::name) << 2) | EReadStartCase::OtherSpecialToken)
  64. static const ui8 lookupTable[] =
  65. {
  66. NN, BS, BI, BD, BF, BT, BU, NN, NN, SP, SP, SP, SP, SP, NN, NN,
  67. NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN,
  68. // 32
  69. SP, // ' '
  70. NN, // '!'
  71. QU, // '"'
  72. TT(Hash), // '#'
  73. NN, // '$'
  74. PC, // '%'
  75. NN, // '&'
  76. NN, // "'"
  77. TT(LeftParenthesis), // '('
  78. TT(RightParenthesis), // ')'
  79. NN, // '*'
  80. PL, // '+'
  81. TT(Comma), // ','
  82. DM, // '-'
  83. NN, // '.'
  84. NN, // '/'
  85. // 48
  86. DM, DM, DM, DM, DM, DM, DM, DM, DM, DM, // '0' - '9'
  87. TT(Colon), // ':'
  88. TT(Semicolon), // ';'
  89. TT(LeftAngle), // '<'
  90. TT(Equals), // '='
  91. TT(RightAngle), // '>'
  92. NN, // '?'
  93. // 64
  94. NN, // '@'
  95. ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, // 'A' - 'M'
  96. ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, // 'N' - 'Z'
  97. TT(LeftBracket), // '['
  98. NN, // '\'
  99. TT(RightBracket), // ']'
  100. NN, // '^'
  101. ST, // '_'
  102. // 96
  103. NN, // '`'
  104. ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, // 'a' - 'm'
  105. ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, // 'n' - 'z'
  106. TT(LeftBrace), // '{'
  107. NN, // '|'
  108. TT(RightBrace), // '}'
  109. NN, // '~'
  110. NN, // '^?' non-printable
  111. // 128
  112. NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN,
  113. NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN,
  114. NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN,
  115. NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN,
  116. NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN,
  117. NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN,
  118. NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN,
  119. NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN};
  120. #undef NN
  121. #undef BS
  122. #undef BI
  123. #undef BD
  124. #undef SP
  125. #undef DM
  126. #undef ST
  127. #undef PL
  128. #undef QU
  129. #undef TT
  130. return static_cast<EReadStartCase>(lookupTable[static_cast<ui8>(ch)]);
  131. }
  132. public:
  133. TLexer(const TBlockStream& blockStream, TMaybe<ui64> memoryLimit)
  134. : TBase(blockStream, memoryLimit)
  135. {
  136. }
  137. void GetToken(TToken* token) {
  138. char ch1 = TBase::SkipSpaceAndGetChar();
  139. auto state = GetStartState(ch1);
  140. auto stateBits = static_cast<unsigned>(state);
  141. if (ch1 == '\0') {
  142. *token = TToken::EndOfStream;
  143. return;
  144. }
  145. if (stateBits & 1) { // Other = x1b
  146. if (stateBits & 1 << 1) { // Other = xxx11b
  147. if (state == EReadStartCase::Quote) {
  148. TStringBuf value;
  149. TBase::Advance(1);
  150. TBase::ReadQuotedString(&value);
  151. *token = TToken(value);
  152. } else if (state == EReadStartCase::DigitOrMinus) {
  153. ReadNumeric<true>(token);
  154. } else if (state == EReadStartCase::Plus) {
  155. TBase::Advance(1);
  156. char ch2 = TBase::template GetChar<true>();
  157. if (!isdigit(ch2)) {
  158. *token = TToken(ETokenType::Plus);
  159. } else {
  160. ReadNumeric<true>(token);
  161. }
  162. } else if (state == EReadStartCase::String) {
  163. TStringBuf value;
  164. TBase::template ReadUnquotedString<true>(&value);
  165. *token = TToken(value);
  166. } else if (state == EReadStartCase::Percent) {
  167. TBase::Advance(1);
  168. char ch3 = TBase::template GetChar<true>();
  169. if (ch3 == 't' || ch3 == 'f') {
  170. *token = TToken(TBase::template ReadBoolean<true>());
  171. } else {
  172. *token = TToken(TBase::template ReadNanOrInf<true>());
  173. }
  174. } else { // None
  175. Y_ASSERT(state == EReadStartCase::None);
  176. ythrow TYsonException() << "Unexpected " << ch1;
  177. }
  178. } else { // BinaryScalar = x01b
  179. TBase::Advance(1);
  180. if (state == EReadStartCase::BinaryDouble) {
  181. double value;
  182. TBase::ReadBinaryDouble(&value);
  183. *token = TToken(value);
  184. } else if (state == EReadStartCase::BinaryInt64) {
  185. i64 value;
  186. TBase::ReadBinaryInt64(&value);
  187. *token = TToken(value);
  188. } else if (state == EReadStartCase::BinaryUint64) {
  189. ui64 value;
  190. TBase::ReadBinaryUint64(&value);
  191. *token = TToken(value);
  192. } else if (state == EReadStartCase::BinaryFalse) {
  193. *token = TToken(false);
  194. } else if (state == EReadStartCase::BinaryTrue) {
  195. *token = TToken(true);
  196. } else {
  197. Y_ABORT("unreachable");
  198. }
  199. }
  200. } else { // BinaryStringOrOtherSpecialToken = x0b
  201. TBase::Advance(1);
  202. if (stateBits & 1 << 1) { // OtherSpecialToken = 10b
  203. Y_ASSERT((stateBits & 3) == static_cast<unsigned>(EReadStartCase::OtherSpecialToken));
  204. *token = TToken(ETokenType(stateBits >> 2));
  205. } else { // BinaryString = 00b
  206. Y_ASSERT((stateBits & 3) == static_cast<unsigned>(EReadStartCase::BinaryString));
  207. TStringBuf value;
  208. TBase::ReadBinaryString(&value);
  209. *token = TToken(value);
  210. }
  211. }
  212. }
  213. template <bool AllowFinish>
  214. void ReadNumeric(TToken* token) {
  215. TStringBuf valueBuffer;
  216. ENumericResult numericResult = TBase::template ReadNumeric<AllowFinish>(&valueBuffer);
  217. if (numericResult == ENumericResult::Double) {
  218. try {
  219. *token = TToken(FromString<double>(valueBuffer));
  220. } catch (yexception&) {
  221. ythrow TYsonException() << "Error parsing double literal " << valueBuffer;
  222. }
  223. } else if (numericResult == ENumericResult::Int64) {
  224. try {
  225. *token = TToken(FromString<i64>(valueBuffer));
  226. } catch (yexception&) {
  227. ythrow TYsonException() << "Error parsing int64 literal " << valueBuffer;
  228. }
  229. } else if (numericResult == ENumericResult::Uint64) {
  230. try {
  231. *token = TToken(FromString<ui64>(valueBuffer.SubStr(0, valueBuffer.size() - 1)));
  232. } catch (yexception&) {
  233. ythrow TYsonException() << "Error parsing uint64 literal " << valueBuffer;
  234. }
  235. }
  236. }
  237. };
  238. ////////////////////////////////////////////////////////////////////////////////
  239. /*! \endinternal */
  240. }
  241. class TStatelessYsonLexerImplBase {
  242. public:
  243. virtual size_t GetToken(const TStringBuf& data, TToken* token) = 0;
  244. virtual ~TStatelessYsonLexerImplBase() {
  245. }
  246. };
  247. template <bool EnableLinePositionInfo>
  248. class TStatelesYsonLexerImpl: public TStatelessYsonLexerImplBase {
  249. private:
  250. using TLexer = NDetail::TLexer<TStringReader, EnableLinePositionInfo>;
  251. TLexer Lexer;
  252. public:
  253. TStatelesYsonLexerImpl()
  254. : Lexer(TStringReader(), Nothing())
  255. {
  256. }
  257. size_t GetToken(const TStringBuf& data, TToken* token) override {
  258. Lexer.SetBuffer(data.begin(), data.end());
  259. Lexer.GetToken(token);
  260. return Lexer.Begin() - data.begin();
  261. }
  262. };
  263. ////////////////////////////////////////////////////////////////////////////////
  264. } // namespace NYson