// lexer_ut.cpp

#include "lexer.h"

#include <yql/essentials/core/issue/yql_issue.h>
#include <yql/essentials/sql/settings/translation_settings.h>
#include <yql/essentials/sql/v1/lexer/antlr3/lexer.h>
#include <yql/essentials/sql/v1/lexer/antlr4/lexer.h>

#include <library/cpp/testing/unittest/registar.h>

using namespace NSQLTranslation;
using namespace NSQLTranslationV1;

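// Runs the given lexer over queryUtf8 and collects both the produced tokens and the reported issues.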
std::pair<TParsedTokenList, NYql::TIssues> Tokenize(ILexer::TPtr& lexer, TString queryUtf8) {
    TParsedTokenList tokens;
    NYql::TIssues issues;
    Tokenize(*lexer, queryUtf8, "", tokens, issues, SQL_MAX_PARSER_ERRORS);
    return {tokens, issues};
}

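// Renders every issue produced for queryUtf8 as a one-line string.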
TVector<TString> GetIssueMessages(ILexer::TPtr& lexer, TString queryUtf8) {
    TVector<TString> messages;
    for (const auto& issue : Tokenize(lexer, queryUtf8).second) {
        messages.emplace_back(issue.ToString(/* oneLine = */ true));
    }
    return messages;
}

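// Returns token names; for ID_PLAIN and STRING_VALUE the token content is appended
// in parentheses, e.g. "ID_PLAIN (FR)".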
TVector<TString> GetTokenViews(ILexer::TPtr& lexer, TString queryUtf8) {
    TVector<TString> names;
    for (auto& token : Tokenize(lexer, queryUtf8).first) {
        TString view = std::move(token.Name);
        if (view == "ID_PLAIN" || view == "STRING_VALUE") {
            view.append(" (");
            view.append(token.Content);
            view.append(")");
        }
        names.emplace_back(std::move(view));
    }
    return names;
}

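// Two tokens are considered equivalent when their name, content, and line match;
// EOF tokens are compared by name only.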
void AssertEquivialent(const TParsedToken& lhs, const TParsedToken& rhs) {
    if (lhs.Name == "EOF" && rhs.Name == "EOF") {
        return;
    }

    UNIT_ASSERT_VALUES_EQUAL(lhs.Name, rhs.Name);
    UNIT_ASSERT_VALUES_EQUAL(lhs.Content, rhs.Content);
    UNIT_ASSERT_VALUES_EQUAL(lhs.Line, rhs.Line);
}

void AssertEquivialent(const TParsedTokenList& lhs, const TParsedTokenList& rhs) {
    UNIT_ASSERT_VALUES_EQUAL(lhs.size(), rhs.size());
    for (size_t i = 0; i < lhs.size(); ++i) {
        AssertEquivialent(lhs.at(i), rhs.at(i));
    }
}

Y_UNIT_TEST_SUITE(SQLv1Lexer) {
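    // Valid queries must produce identical token streams regardless of whether
    // the ANTLR3 or ANTLR4 based lexer is used.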
    Y_UNIT_TEST(AntlrVersionIndependent) {
        const TVector<TString> queriesUtf8 = {
            "",
            " ",
            "SELECT",
            "SEL", // identifier
            "SELECT FROM test",
            "SELECT * FROM",
            " SELECT * FROM ",
            "SELECT \"\xF0\x9F\x98\x8A\" FROM ydb",
            (
                "SELECT \"\xF0\x9F\x98\x8A Hello, друзья\", count, name\n"
                "FROM table -- главная таблица 数据库 \n"
                "WHERE count < 6\n"
                " AND name = \"可靠性\"\n"
                " AND count > 12"),
            "\"select\"select",
        };

        NSQLTranslationV1::TLexers lexers;
        lexers.Antlr3 = NSQLTranslationV1::MakeAntlr3LexerFactory();
        lexers.Antlr4 = NSQLTranslationV1::MakeAntlr4LexerFactory();

        auto lexer3 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ false);
        auto lexer4 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true);

        for (const auto& query : queriesUtf8) {
            auto [tokens3, issues3] = Tokenize(lexer3, query);
            auto [tokens4, issues4] = Tokenize(lexer4, query);
            AssertEquivialent(tokens3, tokens4);
            UNIT_ASSERT(issues3.Empty());
            UNIT_ASSERT(issues4.Empty());
        }
    }

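    // Helper for the error-recovery tests: lexes each invalid query and compares
    // the resulting token views against the expected per-query lists.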
    TVector<TString> InvalidQueries();

    void TestInvalidTokensSkipped(bool antlr4, const TVector<TVector<TString>>& expected) {
        NSQLTranslationV1::TLexers lexers;
        lexers.Antlr3 = NSQLTranslationV1::MakeAntlr3LexerFactory();
        lexers.Antlr4 = NSQLTranslationV1::MakeAntlr4LexerFactory();

        auto lexer = MakeLexer(lexers, /* ansi = */ false, antlr4);

        auto input = InvalidQueries();
        UNIT_ASSERT_VALUES_EQUAL(input.size(), expected.size());

        for (size_t i = 0; i < input.size(); ++i) {
            UNIT_ASSERT_VALUES_EQUAL(GetTokenViews(lexer, input[i]), expected[i]);
        }
    }

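    // Queries that are invalid at the lexical level: unterminated strings,
    // stray characters, and multi-byte characters outside of string literals.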
    TVector<TString> InvalidQueries() {
        return {
            /* 0: */ "\xF0\x9F\x98\x8A",
            /* 1: */ "select \"aaaa",
            /* 2: */ "\"\\\"",
            /* 3: */ "\xF0\x9F\x98\x8A SELECT * FR",
            /* 4: */ "! SELECT * from",
            /* 5: */ "\xF0\x9F\x98\x8Aselect ! from",
            /* 6: */ "\"",
            /* 7: */ "!select",
            /* 8: */ "SELECT \\\"\xF0\x9F\x98\x8A\\\" FROM test",
        };
    }

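    // Expected token views per invalid query; indices match InvalidQueries().
    // The ANTLR3 lexer skips the unrecognized input and continues with the remainder.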
    Y_UNIT_TEST(ErrorRecoveryAntlr3) {
        TVector<TVector<TString>> actual = {
            /* 0: */ {"EOF"},
            /* 1: */ {"SELECT", "WS", "EOF"},
            /* 2: */ {"EOF"},
            /* 3: */ {"WS", "SELECT", "WS", "ASTERISK", "WS", "ID_PLAIN (FR)", "EOF"},
            /* 4: */ {"ID_PLAIN (ELECT)", "WS", "ASTERISK", "WS", "WS", "FROM", "EOF"},
            /* 5: */ {"SELECT", "WS", "ID_PLAIN (rom)", "EOF"},
            /* 6: */ {"EOF"},
            /* 7: */ {"ID_PLAIN (lect)", "EOF"},
            /* 8: */ {"SELECT", "WS", "EOF"},
        };
        TestInvalidTokensSkipped(/* antlr4 = */ false, actual);
    }

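    // ANTLR4 recovers differently: only the unrecognized character itself is dropped,
    // so e.g. "!select" lexes to ID_PLAIN (elect) rather than ID_PLAIN (lect).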
    Y_UNIT_TEST(ErrorRecoveryAntlr4) {
        TVector<TVector<TString>> actual = {
            /* 0: */ {"EOF"},
            /* 1: */ {"SELECT", "WS", "EOF"},
            /* 2: */ {"EOF"},
            /* 3: */ {"WS", "SELECT", "WS", "ASTERISK", "WS", "ID_PLAIN (FR)", "EOF"},
            /* 4: */ {"SELECT", "WS", "ASTERISK", "WS", "WS", "FROM", "EOF"},
            /* 5: */ {"SELECT", "WS", "FROM", "EOF"},
            /* 6: */ {"EOF"},
            /* 7: */ {"ID_PLAIN (elect)", "EOF"},
            /* 8: */ {"SELECT", "WS", "EOF"},
        };
        TestInvalidTokensSkipped(/* antlr4 = */ true, actual);
    }

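    // Every invalid query must produce at least one lexer issue with both ANTLR versions.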
    Y_UNIT_TEST(IssuesCollected) {
        NSQLTranslationV1::TLexers lexers;
        lexers.Antlr3 = NSQLTranslationV1::MakeAntlr3LexerFactory();
        lexers.Antlr4 = NSQLTranslationV1::MakeAntlr4LexerFactory();

        auto lexer3 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ false);
        auto lexer4 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true);

        for (const auto& query : InvalidQueries()) {
            auto issues3 = GetIssueMessages(lexer3, query);
            auto issues4 = GetIssueMessages(lexer4, query);

            UNIT_ASSERT(!issues3.empty());
            UNIT_ASSERT(!issues4.empty());
        }
    }

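    // The ANTLR3 lexer reports one issue per unmatched byte of the four-byte emoji,
    // hence four messages at columns 0 through 3.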
    Y_UNIT_TEST(IssueMessagesAntlr3) {
        NSQLTranslationV1::TLexers lexers;
        lexers.Antlr3 = NSQLTranslationV1::MakeAntlr3LexerFactory();

        auto lexer3 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ false);
        auto actual = GetIssueMessages(lexer3, "\xF0\x9F\x98\x8A SELECT * FR");

        TVector<TString> expected = {
            "<main>:1:0: Error: Unexpected character '\xF0\x9F\x98\x8A' (Unicode character <128522>) : cannot match to any predicted input...",
            "<main>:1:1: Error: Unexpected character : cannot match to any predicted input...",
            "<main>:1:2: Error: Unexpected character : cannot match to any predicted input...",
            "<main>:1:3: Error: Unexpected character : cannot match to any predicted input...",
        };

        UNIT_ASSERT_VALUES_EQUAL(actual, expected);
    }

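    // The ANTLR4 lexer reports a single recognition error covering the whole character.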
    Y_UNIT_TEST(IssueMessagesAntlr4) {
        NSQLTranslationV1::TLexers lexers;
        lexers.Antlr4 = NSQLTranslationV1::MakeAntlr4LexerFactory();

        auto lexer4 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true);
        auto actual = GetIssueMessages(lexer4, "\xF0\x9F\x98\x8A SELECT * FR");

        TVector<TString> expected = {
            "<main>:1:0: Error: token recognition error at: '\xF0\x9F\x98\x8A'",
        };

        UNIT_ASSERT_VALUES_EQUAL(actual, expected);
    }
}