lexer_ut.cpp 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
  1. #include "lexer.h"
  2. #include <yql/essentials/core/issue/yql_issue.h>
  3. #include <yql/essentials/sql/settings/translation_settings.h>
  4. #include <library/cpp/testing/unittest/registar.h>
  5. using namespace NSQLTranslation;
  6. using namespace NSQLTranslationV1;
  7. std::pair<TParsedTokenList, NYql::TIssues> Tokenize(ILexer::TPtr& lexer, TString queryUtf8) {
  8. TParsedTokenList tokens;
  9. NYql::TIssues issues;
  10. Tokenize(*lexer, queryUtf8, "Query", tokens, issues, SQL_MAX_PARSER_ERRORS);
  11. return {tokens, issues};
  12. }
  13. TVector<TString> GetIssueMessages(ILexer::TPtr& lexer, TString queryUtf8) {
  14. TVector<TString> messages;
  15. for (const auto& issue : Tokenize(lexer, queryUtf8).second) {
  16. messages.emplace_back(issue.ToString(/* oneLine = */ true));
  17. }
  18. return messages;
  19. }
  20. TVector<TString> GetTokenViews(ILexer::TPtr& lexer, TString queryUtf8) {
  21. TVector<TString> names;
  22. for (auto& token : Tokenize(lexer, queryUtf8).first) {
  23. TString view = std::move(token.Name);
  24. if (view == "ID_PLAIN" || view == "STRING_VALUE") {
  25. view.append(" (");
  26. view.append(token.Content);
  27. view.append(")");
  28. }
  29. names.emplace_back(std::move(view));
  30. }
  31. return names;
  32. }
  33. void AssertEquivialent(const TParsedToken& lhs, const TParsedToken& rhs) {
  34. if (lhs.Name == "EOF" && rhs.Name == "EOF") {
  35. return;
  36. }
  37. UNIT_ASSERT_VALUES_EQUAL(lhs.Name, rhs.Name);
  38. UNIT_ASSERT_VALUES_EQUAL(lhs.Content, rhs.Content);
  39. UNIT_ASSERT_VALUES_EQUAL(lhs.Line, rhs.Line);
  40. }
  41. void AssertEquivialent(const TParsedTokenList& lhs, const TParsedTokenList& rhs) {
  42. UNIT_ASSERT_VALUES_EQUAL(lhs.size(), rhs.size());
  43. for (size_t i = 0; i < lhs.size(); ++i) {
  44. AssertEquivialent(lhs.at(i), rhs.at(i));
  45. }
  46. }
  47. Y_UNIT_TEST_SUITE(SQLv1Lexer) {
  48. Y_UNIT_TEST(AntlrVersionIndependent) {
  49. const TVector<TString> queriesUtf8 = {
  50. "",
  51. " ",
  52. "SELECT",
  53. "SEL", // identifier
  54. "SELECT FROM test",
  55. "SELECT * FROM",
  56. " SELECT * FROM ",
  57. "SELECT \"\xF0\x9F\x98\x8A\" FROM ydb",
  58. (
  59. "SELECT \"\xF0\x9F\x98\x8A Hello, друзья\", count, name\n"
  60. "FROM table -- главная таблица 数据库 \n"
  61. "WHERE count < 6\n"
  62. " AND name = \"可靠性\"\n"
  63. " AND count > 12"),
  64. "\"select\"select",
  65. };
  66. auto lexer3 = MakeLexer(/* ansi = */ false, /* antlr4 = */ false);
  67. auto lexer4 = MakeLexer(/* ansi = */ false, /* antlr4 = */ true);
  68. for (const auto& query : queriesUtf8) {
  69. auto [tokens3, issues3] = Tokenize(lexer3, query);
  70. auto [tokens4, issues4] = Tokenize(lexer4, query);
  71. AssertEquivialent(tokens3, tokens4);
  72. UNIT_ASSERT(issues3.Empty());
  73. UNIT_ASSERT(issues4.Empty());
  74. }
  75. }
  76. TVector<TString> InvalidQueries();
  77. void TestInvalidTokensSkipped(bool antlr4, const TVector<TVector<TString>>& expected) {
  78. auto lexer = MakeLexer(/* ansi = */ false, antlr4);
  79. auto input = InvalidQueries();
  80. UNIT_ASSERT_VALUES_EQUAL(input.size(), expected.size());
  81. for (size_t i = 0; i < input.size(); ++i) {
  82. UNIT_ASSERT_VALUES_EQUAL(GetTokenViews(lexer, input[i]), expected[i]);
  83. }
  84. }
  85. TVector<TString> InvalidQueries() {
  86. return {
  87. /* 0: */ "\xF0\x9F\x98\x8A",
  88. /* 1: */ "select \"aaaa",
  89. /* 2: */ "\"\\\"",
  90. /* 3: */ "\xF0\x9F\x98\x8A SELECT * FR",
  91. /* 4: */ "! SELECT * from",
  92. /* 5: */ "\xF0\x9F\x98\x8Aselect ! from",
  93. /* 6: */ "\"",
  94. /* 7: */ "!select",
  95. /* 8: */ "SELECT \\\"\xF0\x9F\x98\x8A\\\" FROM test",
  96. };
  97. }
  98. Y_UNIT_TEST(ErrorRecoveryAntlr3) {
  99. TVector<TVector<TString>> actual = {
  100. /* 0: */ {"EOF"},
  101. /* 1: */ {"SELECT", "WS", "EOF"},
  102. /* 2: */ {"EOF"},
  103. /* 3: */ {"WS", "SELECT", "WS", "ASTERISK", "WS", "ID_PLAIN (FR)", "EOF"},
  104. /* 4: */ {"ID_PLAIN (ELECT)", "WS", "ASTERISK", "WS", "WS", "FROM", "EOF"},
  105. /* 5: */ {"SELECT", "WS", "ID_PLAIN (rom)", "EOF"},
  106. /* 6: */ {"EOF"},
  107. /* 7: */ {"ID_PLAIN (lect)", "EOF"},
  108. /* 8: */ {"SELECT", "WS", "EOF"},
  109. };
  110. TestInvalidTokensSkipped(/* antlr4 = */ false, actual);
  111. }
  112. Y_UNIT_TEST(ErrorRecoveryAntlr4) {
  113. TVector<TVector<TString>> actual = {
  114. /* 0: */ {"EOF"},
  115. /* 1: */ {"SELECT", "WS", "EOF"},
  116. /* 2: */ {"EOF"},
  117. /* 3: */ {"WS", "SELECT", "WS", "ASTERISK", "WS", "ID_PLAIN (FR)", "EOF"},
  118. /* 4: */ {"SELECT", "WS", "ASTERISK", "WS", "WS", "FROM", "EOF"},
  119. /* 5: */ {"SELECT", "WS", "FROM", "EOF"},
  120. /* 6: */ {"EOF"},
  121. /* 7: */ {"ID_PLAIN (elect)", "EOF"},
  122. /* 8: */ {"SELECT", "WS", "EOF"},
  123. };
  124. TestInvalidTokensSkipped(/* antlr4 = */ true, actual);
  125. }
  126. Y_UNIT_TEST(IssuesCollected) {
  127. auto lexer3 = MakeLexer(/* ansi = */ false, /* antlr4 = */ false);
  128. auto lexer4 = MakeLexer(/* ansi = */ false, /* antlr4 = */ true);
  129. for (const auto& query : InvalidQueries()) {
  130. auto issues3 = GetIssueMessages(lexer3, query);
  131. auto issues4 = GetIssueMessages(lexer4, query);
  132. UNIT_ASSERT(!issues3.empty());
  133. UNIT_ASSERT(!issues4.empty());
  134. }
  135. }
  136. Y_UNIT_TEST(IssueMessagesAntlr3) {
  137. auto lexer3 = MakeLexer(/* ansi = */ false, /* antlr4 = */ false);
  138. auto actual = GetIssueMessages(lexer3, "\xF0\x9F\x98\x8A SELECT * FR");
  139. TVector<TString> expected = {
  140. "<main>:1:0: Error: Unexpected character '\xF0\x9F\x98\x8A' (Unicode character <128522>) : cannot match to any predicted input...",
  141. "<main>:1:1: Error: Unexpected character : cannot match to any predicted input...",
  142. "<main>:1:2: Error: Unexpected character : cannot match to any predicted input...",
  143. "<main>:1:3: Error: Unexpected character : cannot match to any predicted input...",
  144. };
  145. UNIT_ASSERT_VALUES_EQUAL(actual, expected);
  146. }
  147. Y_UNIT_TEST(IssueMessagesAntlr4) {
  148. auto lexer4 = MakeLexer(/* ansi = */ false, /* antlr4 = */ true);
  149. auto actual = GetIssueMessages(lexer4, "\xF0\x9F\x98\x8A SELECT * FR");
  150. TVector<TString> expected = {
  151. "<main>:1:0: Error: token recognition error at: '\xF0\x9F\x98\x8A'",
  152. };
  153. UNIT_ASSERT_VALUES_EQUAL(actual, expected);
  154. }
  155. }