lexer.cpp 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286
#include "lexer.h"

#include <yql/essentials/public/issue/yql_issue.h>
#include <yql/essentials/parser/proto_ast/collect_issues/collect_issues.h>
#include <yql/essentials/parser/proto_ast/antlr3/proto_ast_antlr3.h>
#include <yql/essentials/parser/proto_ast/antlr4/proto_ast_antlr4.h>
#include <yql/essentials/parser/proto_ast/gen/v1/SQLv1Lexer.h>
#include <yql/essentials/parser/proto_ast/gen/v1_ansi/SQLv1Lexer.h>
#include <yql/essentials/parser/proto_ast/gen/v1_antlr4/SQLv1Antlr4Lexer.h>
#include <yql/essentials/parser/proto_ast/gen/v1_ansi_antlr4/SQLv1Antlr4Lexer.h>
#include <yql/essentials/sql/v1/sql.h>

#include <util/string/ascii.h>
#include <util/string/builder.h>
#include <util/string/strip.h>

#include <utility>

#if defined(_tsan_enabled_)
#include <util/system/mutex.h>
#endif
  17. namespace NALPDefault {
  18. extern ANTLR_UINT8 *SQLv1ParserTokenNames[];
  19. }
  20. namespace NALPAnsi {
  21. extern ANTLR_UINT8 *SQLv1ParserTokenNames[];
  22. }
  23. namespace NSQLTranslationV1 {
  24. namespace {
  25. #if defined(_tsan_enabled_)
  26. TMutex SanitizerSQLTranslationMutex;
  27. #endif
  28. using NSQLTranslation::ILexer;
  29. class TV1Lexer : public ILexer {
  30. public:
  31. explicit TV1Lexer(bool ansi, bool antlr4)
  32. : Ansi(ansi), Antlr4(antlr4)
  33. {
  34. }
  35. bool Tokenize(const TString& query, const TString& queryName, const TTokenCallback& onNextToken, NYql::TIssues& issues, size_t maxErrors) override {
  36. NYql::TIssues newIssues;
  37. #if defined(_tsan_enabled_)
  38. TGuard<TMutex> grd(SanitizerSQLTranslationMutex);
  39. #endif
  40. NSQLTranslation::TErrorCollectorOverIssues collector(newIssues, maxErrors, "");
  41. if (Ansi && !Antlr4) {
  42. NProtoAST::TLexerTokensCollector3<NALPAnsi::SQLv1Lexer> tokensCollector(query, (const char**)NALPAnsi::SQLv1ParserTokenNames, queryName);
  43. tokensCollector.CollectTokens(collector, onNextToken);
  44. } else if (!Ansi && !Antlr4) {
  45. NProtoAST::TLexerTokensCollector3<NALPDefault::SQLv1Lexer> tokensCollector(query, (const char**)NALPDefault::SQLv1ParserTokenNames, queryName);
  46. tokensCollector.CollectTokens(collector, onNextToken);
  47. } else if (Ansi && Antlr4) {
  48. NProtoAST::TLexerTokensCollector4<NALPAnsiAntlr4::SQLv1Antlr4Lexer> tokensCollector(query, queryName);
  49. tokensCollector.CollectTokens(collector, onNextToken);
  50. } else {
  51. NProtoAST::TLexerTokensCollector4<NALPDefaultAntlr4::SQLv1Antlr4Lexer> tokensCollector(query, queryName);
  52. tokensCollector.CollectTokens(collector, onNextToken);
  53. }
  54. issues.AddIssues(newIssues);
  55. return !AnyOf(newIssues.begin(), newIssues.end(), [](auto issue) { return issue.GetSeverity() == NYql::ESeverity::TSeverityIds_ESeverityId_S_ERROR; });
  56. }
  57. private:
  58. const bool Ansi;
  59. const bool Antlr4;
  60. };
  61. } // namespace
  62. NSQLTranslation::ILexer::TPtr MakeLexer(bool ansi, bool antlr4) {
  63. return NSQLTranslation::ILexer::TPtr(new TV1Lexer(ansi, antlr4));
  64. }
  65. bool IsProbablyKeyword(const NSQLTranslation::TParsedToken& token) {
  66. return AsciiEqualsIgnoreCase(token.Name, token.Content);
  67. }
  68. using NSQLTranslation::TParsedTokenList;
  69. using TTokenIterator = TParsedTokenList::const_iterator;
  70. namespace {
  71. enum EParenType {
  72. Open,
  73. Close,
  74. None
  75. };
  76. using TAdvanceCallback = std::function<EParenType(TTokenIterator& curr, TTokenIterator end)>;
  77. TTokenIterator SkipWS(TTokenIterator curr, TTokenIterator end) {
  78. while (curr != end && curr->Name == "WS") {
  79. ++curr;
  80. }
  81. return curr;
  82. }
  83. TTokenIterator SkipWSOrComment(TTokenIterator curr, TTokenIterator end) {
  84. while (curr != end && (curr->Name == "WS" || curr->Name == "COMMENT")) {
  85. ++curr;
  86. }
  87. return curr;
  88. }
  89. TTokenIterator SkipToNextBalanced(TTokenIterator begin, TTokenIterator end, const TAdvanceCallback& advance) {
  90. i64 level = 0;
  91. TTokenIterator curr = begin;
  92. while (curr != end) {
  93. switch (advance(curr, end)) {
  94. case EParenType::Open: {
  95. ++level;
  96. break;
  97. }
  98. case EParenType::Close: {
  99. --level;
  100. if (level < 0) {
  101. return end;
  102. } else if (level == 0) {
  103. return curr;
  104. }
  105. break;
  106. }
  107. case EParenType::None:
  108. break;
  109. }
  110. }
  111. return curr;
  112. }
  113. TTokenIterator GetNextStatementBegin(TTokenIterator begin, TTokenIterator end) {
  114. TAdvanceCallback advanceLambdaBody = [](TTokenIterator& curr, TTokenIterator end) -> EParenType {
  115. Y_UNUSED(end);
  116. if (curr->Name == "LBRACE_CURLY") {
  117. ++curr;
  118. return EParenType::Open;
  119. } else if (curr->Name == "RBRACE_CURLY") {
  120. ++curr;
  121. return EParenType::Close;
  122. } else {
  123. ++curr;
  124. return EParenType::None;
  125. }
  126. };
  127. TAdvanceCallback advanceAction = [](TTokenIterator& curr, TTokenIterator end) -> EParenType {
  128. auto tmp = curr;
  129. if (curr->Name == "DEFINE") {
  130. ++curr;
  131. curr = SkipWSOrComment(curr, end);
  132. if (curr != end && (curr->Name == "ACTION" || curr->Name == "SUBQUERY")) {
  133. ++curr;
  134. return EParenType::Open;
  135. }
  136. } else if (curr->Name == "END") {
  137. ++curr;
  138. curr = SkipWSOrComment(curr, end);
  139. if (curr != end && curr->Name == "DEFINE") {
  140. ++curr;
  141. return EParenType::Close;
  142. }
  143. }
  144. curr = tmp;
  145. ++curr;
  146. return EParenType::None;
  147. };
  148. TAdvanceCallback advanceInlineAction = [](TTokenIterator& curr, TTokenIterator end) -> EParenType {
  149. auto tmp = curr;
  150. if (curr->Name == "DO") {
  151. ++curr;
  152. curr = SkipWSOrComment(curr, end);
  153. if (curr != end && curr->Name == "BEGIN") {
  154. ++curr;
  155. return EParenType::Open;
  156. }
  157. } else if (curr->Name == "END") {
  158. ++curr;
  159. curr = SkipWSOrComment(curr, end);
  160. if (curr != end && curr->Name == "DO") {
  161. ++curr;
  162. return EParenType::Close;
  163. }
  164. }
  165. curr = tmp;
  166. ++curr;
  167. return EParenType::None;
  168. };
  169. TTokenIterator curr = begin;
  170. while (curr != end) {
  171. bool matched = false;
  172. for (auto cb : {advanceLambdaBody, advanceAction, advanceInlineAction}) {
  173. TTokenIterator tmp = curr;
  174. if (cb(tmp, end) == EParenType::Open) {
  175. curr = SkipToNextBalanced(curr, end, cb);
  176. matched = true;
  177. if (curr == end) {
  178. return curr;
  179. }
  180. }
  181. }
  182. if (matched) {
  183. continue;
  184. }
  185. if (curr->Name == "SEMICOLON") {
  186. auto next = SkipWS(curr + 1, end);
  187. while (next != end && next->Name == "COMMENT" && curr->Line == next->Line) {
  188. curr = next;
  189. next = SkipWS(next + 1, end);
  190. }
  191. ++curr;
  192. break;
  193. }
  194. ++curr;
  195. }
  196. return curr;
  197. }
  198. void SplitByStatements(TTokenIterator begin, TTokenIterator end, TVector<TTokenIterator>& output) {
  199. output.clear();
  200. if (begin == end) {
  201. return;
  202. }
  203. output.push_back(begin);
  204. auto curr = begin;
  205. while (curr != end) {
  206. curr = GetNextStatementBegin(curr, end);
  207. output.push_back(curr);
  208. }
  209. }
  210. }
  211. bool SplitQueryToStatements(const TString& query, NSQLTranslation::ILexer::TPtr& lexer, TVector<TString>& statements, NYql::TIssues& issues) {
  212. TParsedTokenList allTokens;
  213. auto onNextToken = [&](NSQLTranslation::TParsedToken&& token) {
  214. if (token.Name != "EOF") {
  215. allTokens.push_back(token);
  216. }
  217. };
  218. if (!lexer->Tokenize(query, "Query", onNextToken, issues, NSQLTranslation::SQL_MAX_PARSER_ERRORS)) {
  219. return false;
  220. }
  221. TVector<TTokenIterator> statementsTokens;
  222. SplitByStatements(allTokens.begin(), allTokens.end(), statementsTokens);
  223. for (size_t i = 1; i < statementsTokens.size(); ++i) {
  224. TStringBuilder currentQueryBuilder;
  225. for (auto it = statementsTokens[i - 1]; it != statementsTokens[i]; ++it) {
  226. currentQueryBuilder << it->Content;
  227. }
  228. TString statement = currentQueryBuilder;
  229. statement = StripStringLeft(statement);
  230. bool isBlank = true;
  231. for (auto c : statement) {
  232. if (c != ';') {
  233. isBlank = false;
  234. break;
  235. }
  236. };
  237. if (isBlank) {
  238. continue;
  239. }
  240. statements.push_back(statement);
  241. }
  242. return true;
  243. }
  244. } // namespace NSQLTranslationV1