lexer.cpp 8.0 KB


  1. #include "lexer.h"
  2. #include <yql/essentials/public/issue/yql_issue.h>
  3. #include <yql/essentials/parser/lexer_common/lexer.h>
  4. #include <yql/essentials/sql/v1/lexer/antlr3/lexer.h>
  5. #include <yql/essentials/sql/v1/lexer/antlr3_ansi/lexer.h>
  6. #include <yql/essentials/sql/v1/lexer/antlr4/lexer.h>
  7. #include <yql/essentials/sql/v1/lexer/antlr4_ansi/lexer.h>
  8. #include <yql/essentials/sql/settings/translation_settings.h>
  9. #include <util/string/ascii.h>
  10. #include <util/string/builder.h>
  11. #include <util/string/strip.h>
  #include <utility>
  12. #if defined(_tsan_enabled_)
  13. #include <util/system/mutex.h>
  14. #endif
  15. namespace NSQLTranslationV1 {
  16. namespace {
  // File-local mutex that exists only in builds where _tsan_enabled_ is defined.
  // TV1Lexer::Tokenize holds it for the whole duration of each call, so
  // tokenization is serialized across threads in such builds.
  17. #if defined(_tsan_enabled_)
  18. TMutex SanitizerSQLTranslationMutex;
  19. #endif
  20. using NSQLTranslation::ILexer;
  21. using NSQLTranslation::MakeDummyLexerFactory;
  22. class TV1Lexer : public ILexer {
  23. public:
  24. explicit TV1Lexer(const TLexers& lexers, bool ansi, bool antlr4)
  25. : Factory(GetFactory(lexers, ansi, antlr4))
  26. {
  27. }
  28. bool Tokenize(const TString& query, const TString& queryName, const TTokenCallback& onNextToken, NYql::TIssues& issues, size_t maxErrors) override {
  29. #if defined(_tsan_enabled_)
  30. TGuard<TMutex> grd(SanitizerSQLTranslationMutex);
  31. #endif
  32. return Factory->MakeLexer()->Tokenize(query, queryName, onNextToken, issues, maxErrors);
  33. }
  34. private:
  35. static NSQLTranslation::TLexerFactoryPtr GetFactory(const TLexers& lexers, bool ansi, bool antlr4) {
  36. if (!ansi && !antlr4) {
  37. if (lexers.Antlr3) {
  38. return lexers.Antlr3;
  39. }
  40. return MakeDummyLexerFactory("antlr3");
  41. } else if (ansi && !antlr4) {
  42. if (lexers.Antlr3Ansi) {
  43. return lexers.Antlr3Ansi;
  44. }
  45. return MakeDummyLexerFactory("antlr3_ansi");
  46. } else if (!ansi && antlr4) {
  47. if (lexers.Antlr4) {
  48. return lexers.Antlr4;
  49. }
  50. return MakeDummyLexerFactory("antlr4");
  51. } else {
  52. if (lexers.Antlr4Ansi) {
  53. return lexers.Antlr4Ansi;
  54. }
  55. return MakeDummyLexerFactory("antlr4_ansi");
  56. }
  57. }
  58. private:
  59. NSQLTranslation::TLexerFactoryPtr Factory;
  60. };
  61. } // namespace
  62. NSQLTranslation::ILexer::TPtr MakeLexer(const TLexers& lexers, bool ansi, bool antlr4) {
  63. return NSQLTranslation::ILexer::TPtr(new TV1Lexer(lexers, ansi, antlr4));
  64. }
  65. bool IsProbablyKeyword(const NSQLTranslation::TParsedToken& token) {
  66. return AsciiEqualsIgnoreCase(token.Name, token.Content);
  67. }
  // TTokenIterator is a read-only cursor into a TParsedTokenList; the
  // statement-splitting helpers in this file take and return it.
  68. using NSQLTranslation::TParsedTokenList;
  69. using TTokenIterator = TParsedTokenList::const_iterator;
  70. namespace {
  // Result of an advance callback: whether the token(s) it just consumed open a
  // nested construct, close one, or do neither. SkipToNextBalanced increments
  // its nesting level on Open and decrements it on Close.
  // Scoped so that Open/Close/None do not leak into the enclosing namespace and
  // do not convert implicitly to integers.
  enum class EParenType {
      Open,
      Close,
      None
  };
  // Callback used while scanning tokens: it receives the current position by
  // reference (and moves it forward) plus the end of the range, and reports how
  // the consumed token(s) affect nesting via EParenType.
  76. using TAdvanceCallback = std::function<EParenType(TTokenIterator& curr, TTokenIterator end)>;
  77. TTokenIterator SkipWS(TTokenIterator curr, TTokenIterator end) {
  78. while (curr != end && curr->Name == "WS") {
  79. ++curr;
  80. }
  81. return curr;
  82. }
  83. TTokenIterator SkipWSOrComment(TTokenIterator curr, TTokenIterator end) {
  84. while (curr != end && (curr->Name == "WS" || curr->Name == "COMMENT")) {
  85. ++curr;
  86. }
  87. return curr;
  88. }
  89. TTokenIterator SkipToNextBalanced(TTokenIterator begin, TTokenIterator end, const TAdvanceCallback& advance) {
  90. i64 level = 0;
  91. TTokenIterator curr = begin;
  92. while (curr != end) {
  93. switch (advance(curr, end)) {
  94. case EParenType::Open: {
  95. ++level;
  96. break;
  97. }
  98. case EParenType::Close: {
  99. --level;
  100. if (level < 0) {
  101. return end;
  102. } else if (level == 0) {
  103. return curr;
  104. }
  105. break;
  106. }
  107. case EParenType::None:
  108. break;
  109. }
  110. }
  111. return curr;
  112. }
  113. TTokenIterator GetNextStatementBegin(TTokenIterator begin, TTokenIterator end) {
  114. TAdvanceCallback advanceLambdaBody = [](TTokenIterator& curr, TTokenIterator end) -> EParenType {
  115. Y_UNUSED(end);
  116. if (curr->Name == "LBRACE_CURLY") {
  117. ++curr;
  118. return EParenType::Open;
  119. } else if (curr->Name == "RBRACE_CURLY") {
  120. ++curr;
  121. return EParenType::Close;
  122. } else {
  123. ++curr;
  124. return EParenType::None;
  125. }
  126. };
  127. TAdvanceCallback advanceAction = [](TTokenIterator& curr, TTokenIterator end) -> EParenType {
  128. auto tmp = curr;
  129. if (curr->Name == "DEFINE") {
  130. ++curr;
  131. curr = SkipWSOrComment(curr, end);
  132. if (curr != end && (curr->Name == "ACTION" || curr->Name == "SUBQUERY")) {
  133. ++curr;
  134. return EParenType::Open;
  135. }
  136. } else if (curr->Name == "END") {
  137. ++curr;
  138. curr = SkipWSOrComment(curr, end);
  139. if (curr != end && curr->Name == "DEFINE") {
  140. ++curr;
  141. return EParenType::Close;
  142. }
  143. }
  144. curr = tmp;
  145. ++curr;
  146. return EParenType::None;
  147. };
  148. TAdvanceCallback advanceInlineAction = [](TTokenIterator& curr, TTokenIterator end) -> EParenType {
  149. auto tmp = curr;
  150. if (curr->Name == "DO") {
  151. ++curr;
  152. curr = SkipWSOrComment(curr, end);
  153. if (curr != end && curr->Name == "BEGIN") {
  154. ++curr;
  155. return EParenType::Open;
  156. }
  157. } else if (curr->Name == "END") {
  158. ++curr;
  159. curr = SkipWSOrComment(curr, end);
  160. if (curr != end && curr->Name == "DO") {
  161. ++curr;
  162. return EParenType::Close;
  163. }
  164. }
  165. curr = tmp;
  166. ++curr;
  167. return EParenType::None;
  168. };
  169. TTokenIterator curr = begin;
  170. while (curr != end) {
  171. bool matched = false;
  172. for (auto cb : {advanceLambdaBody, advanceAction, advanceInlineAction}) {
  173. TTokenIterator tmp = curr;
  174. if (cb(tmp, end) == EParenType::Open) {
  175. curr = SkipToNextBalanced(curr, end, cb);
  176. matched = true;
  177. if (curr == end) {
  178. return curr;
  179. }
  180. }
  181. }
  182. if (matched) {
  183. continue;
  184. }
  185. if (curr->Name == "SEMICOLON") {
  186. auto next = SkipWS(curr + 1, end);
  187. while (next != end && next->Name == "COMMENT" && curr->Line == next->Line) {
  188. curr = next;
  189. next = SkipWS(next + 1, end);
  190. }
  191. ++curr;
  192. break;
  193. }
  194. ++curr;
  195. }
  196. return curr;
  197. }
  198. void SplitByStatements(TTokenIterator begin, TTokenIterator end, TVector<TTokenIterator>& output) {
  199. output.clear();
  200. if (begin == end) {
  201. return;
  202. }
  203. output.push_back(begin);
  204. auto curr = begin;
  205. while (curr != end) {
  206. curr = GetNextStatementBegin(curr, end);
  207. output.push_back(curr);
  208. }
  209. }
  210. }
  211. bool SplitQueryToStatements(const TString& query, NSQLTranslation::ILexer::TPtr& lexer, TVector<TString>& statements, NYql::TIssues& issues, const TString& file) {
  212. TParsedTokenList allTokens;
  213. auto onNextToken = [&](NSQLTranslation::TParsedToken&& token) {
  214. if (token.Name != "EOF") {
  215. allTokens.push_back(token);
  216. }
  217. };
  218. if (!lexer->Tokenize(query, file, onNextToken, issues, NSQLTranslation::SQL_MAX_PARSER_ERRORS)) {
  219. return false;
  220. }
  221. TVector<TTokenIterator> statementsTokens;
  222. SplitByStatements(allTokens.begin(), allTokens.end(), statementsTokens);
  223. for (size_t i = 1; i < statementsTokens.size(); ++i) {
  224. TStringBuilder currentQueryBuilder;
  225. for (auto it = statementsTokens[i - 1]; it != statementsTokens[i]; ++it) {
  226. currentQueryBuilder << it->Content;
  227. }
  228. TString statement = currentQueryBuilder;
  229. statement = StripStringLeft(statement);
  230. bool isBlank = true;
  231. for (auto c : statement) {
  232. if (c != ';') {
  233. isBlank = false;
  234. break;
  235. }
  236. };
  237. if (isBlank) {
  238. continue;
  239. }
  240. statements.push_back(statement);
  241. }
  242. return true;
  243. }
  244. } // namespace NSQLTranslationV1