// lexer.cpp
#include "lexer.h"

#include <yql/essentials/public/issue/yql_issue.h>
#include <yql/essentials/parser/lexer_common/lexer.h>
#include <yql/essentials/sql/v1/lexer/antlr3/lexer.h>
#include <yql/essentials/sql/v1/lexer/antlr3_ansi/lexer.h>
#include <yql/essentials/sql/v1/lexer/antlr4/lexer.h>
#include <yql/essentials/sql/v1/lexer/antlr4_ansi/lexer.h>
#include <yql/essentials/sql/settings/translation_settings.h>

#include <util/string/ascii.h>
#include <util/string/builder.h>
#include <util/string/strip.h>

#include <utility>

#if defined(_tsan_enabled_)
#include <util/system/mutex.h>
#endif
  15. namespace NSQLTranslationV1 {
  16. TLexers MakeAllLexers() {
  17. return TLexers {
  18. .Antlr3 = MakeAntlr3LexerFactory(),
  19. .Antlr3Ansi = MakeAntlr3AnsiLexerFactory(),
  20. .Antlr4 = MakeAntlr4LexerFactory(),
  21. .Antlr4Ansi = MakeAntlr4AnsiLexerFactory()
  22. };
  23. }
  24. namespace {
  // Global mutex that exists only in builds where _tsan_enabled_ is defined;
  // TV1Lexer::Tokenize holds it for the duration of each call in such builds.
  #ifdef _tsan_enabled_
  TMutex SanitizerSQLTranslationMutex;
  #endif
  28. using NSQLTranslation::ILexer;
  29. using NSQLTranslation::MakeDummyLexerFactory;
  30. class TV1Lexer : public ILexer {
  31. public:
  32. explicit TV1Lexer(const TLexers& lexers, bool ansi, bool antlr4)
  33. : Factory(GetFactory(lexers, ansi, antlr4))
  34. {
  35. }
  36. bool Tokenize(const TString& query, const TString& queryName, const TTokenCallback& onNextToken, NYql::TIssues& issues, size_t maxErrors) override {
  37. #if defined(_tsan_enabled_)
  38. TGuard<TMutex> grd(SanitizerSQLTranslationMutex);
  39. #endif
  40. return Factory->MakeLexer()->Tokenize(query, queryName, onNextToken, issues, maxErrors);
  41. }
  42. private:
  43. static NSQLTranslation::TLexerFactoryPtr GetFactory(const TLexers& lexers, bool ansi, bool antlr4) {
  44. if (!ansi && !antlr4) {
  45. if (lexers.Antlr3) {
  46. return lexers.Antlr3;
  47. }
  48. return MakeDummyLexerFactory("antlr3");
  49. } else if (ansi && !antlr4) {
  50. if (lexers.Antlr3Ansi) {
  51. return lexers.Antlr3Ansi;
  52. }
  53. return MakeDummyLexerFactory("antlr3_ansi");
  54. } else if (!ansi && antlr4) {
  55. if (lexers.Antlr4) {
  56. return lexers.Antlr4;
  57. }
  58. return MakeDummyLexerFactory("antlr4");
  59. } else {
  60. if (lexers.Antlr4Ansi) {
  61. return lexers.Antlr4Ansi;
  62. }
  63. return MakeDummyLexerFactory("antlr4_ansi");
  64. }
  65. }
  66. private:
  67. NSQLTranslation::TLexerFactoryPtr Factory;
  68. };
  69. } // namespace
  70. NSQLTranslation::ILexer::TPtr MakeLexer(bool ansi, bool antlr4) {
  71. return NSQLTranslation::ILexer::TPtr(new TV1Lexer(MakeAllLexers(), ansi, antlr4));
  72. }
  73. NSQLTranslation::ILexer::TPtr MakeLexer(const TLexers& lexers, bool ansi, bool antlr4) {
  74. return NSQLTranslation::ILexer::TPtr(new TV1Lexer(lexers, ansi, antlr4));
  75. }
  76. bool IsProbablyKeyword(const NSQLTranslation::TParsedToken& token) {
  77. return AsciiEqualsIgnoreCase(token.Name, token.Content);
  78. }
  79. using NSQLTranslation::TParsedTokenList;
  80. using TTokenIterator = TParsedTokenList::const_iterator;
  81. namespace {
  // Classification returned by an advance callback for the token(s) it consumed:
  // an opening bracket-like construct, a closing one, or neither.
  // Scoped so that Open/Close/None do not leak into the enclosing namespace and
  // do not convert implicitly to integers.
  enum class EParenType {
      Open,
      Close,
      None
  };
  87. using TAdvanceCallback = std::function<EParenType(TTokenIterator& curr, TTokenIterator end)>;
  88. TTokenIterator SkipWS(TTokenIterator curr, TTokenIterator end) {
  89. while (curr != end && curr->Name == "WS") {
  90. ++curr;
  91. }
  92. return curr;
  93. }
  94. TTokenIterator SkipWSOrComment(TTokenIterator curr, TTokenIterator end) {
  95. while (curr != end && (curr->Name == "WS" || curr->Name == "COMMENT")) {
  96. ++curr;
  97. }
  98. return curr;
  99. }
  100. TTokenIterator SkipToNextBalanced(TTokenIterator begin, TTokenIterator end, const TAdvanceCallback& advance) {
  101. i64 level = 0;
  102. TTokenIterator curr = begin;
  103. while (curr != end) {
  104. switch (advance(curr, end)) {
  105. case EParenType::Open: {
  106. ++level;
  107. break;
  108. }
  109. case EParenType::Close: {
  110. --level;
  111. if (level < 0) {
  112. return end;
  113. } else if (level == 0) {
  114. return curr;
  115. }
  116. break;
  117. }
  118. case EParenType::None:
  119. break;
  120. }
  121. }
  122. return curr;
  123. }
  124. TTokenIterator GetNextStatementBegin(TTokenIterator begin, TTokenIterator end) {
  125. TAdvanceCallback advanceLambdaBody = [](TTokenIterator& curr, TTokenIterator end) -> EParenType {
  126. Y_UNUSED(end);
  127. if (curr->Name == "LBRACE_CURLY") {
  128. ++curr;
  129. return EParenType::Open;
  130. } else if (curr->Name == "RBRACE_CURLY") {
  131. ++curr;
  132. return EParenType::Close;
  133. } else {
  134. ++curr;
  135. return EParenType::None;
  136. }
  137. };
  138. TAdvanceCallback advanceAction = [](TTokenIterator& curr, TTokenIterator end) -> EParenType {
  139. auto tmp = curr;
  140. if (curr->Name == "DEFINE") {
  141. ++curr;
  142. curr = SkipWSOrComment(curr, end);
  143. if (curr != end && (curr->Name == "ACTION" || curr->Name == "SUBQUERY")) {
  144. ++curr;
  145. return EParenType::Open;
  146. }
  147. } else if (curr->Name == "END") {
  148. ++curr;
  149. curr = SkipWSOrComment(curr, end);
  150. if (curr != end && curr->Name == "DEFINE") {
  151. ++curr;
  152. return EParenType::Close;
  153. }
  154. }
  155. curr = tmp;
  156. ++curr;
  157. return EParenType::None;
  158. };
  159. TAdvanceCallback advanceInlineAction = [](TTokenIterator& curr, TTokenIterator end) -> EParenType {
  160. auto tmp = curr;
  161. if (curr->Name == "DO") {
  162. ++curr;
  163. curr = SkipWSOrComment(curr, end);
  164. if (curr != end && curr->Name == "BEGIN") {
  165. ++curr;
  166. return EParenType::Open;
  167. }
  168. } else if (curr->Name == "END") {
  169. ++curr;
  170. curr = SkipWSOrComment(curr, end);
  171. if (curr != end && curr->Name == "DO") {
  172. ++curr;
  173. return EParenType::Close;
  174. }
  175. }
  176. curr = tmp;
  177. ++curr;
  178. return EParenType::None;
  179. };
  180. TTokenIterator curr = begin;
  181. while (curr != end) {
  182. bool matched = false;
  183. for (auto cb : {advanceLambdaBody, advanceAction, advanceInlineAction}) {
  184. TTokenIterator tmp = curr;
  185. if (cb(tmp, end) == EParenType::Open) {
  186. curr = SkipToNextBalanced(curr, end, cb);
  187. matched = true;
  188. if (curr == end) {
  189. return curr;
  190. }
  191. }
  192. }
  193. if (matched) {
  194. continue;
  195. }
  196. if (curr->Name == "SEMICOLON") {
  197. auto next = SkipWS(curr + 1, end);
  198. while (next != end && next->Name == "COMMENT" && curr->Line == next->Line) {
  199. curr = next;
  200. next = SkipWS(next + 1, end);
  201. }
  202. ++curr;
  203. break;
  204. }
  205. ++curr;
  206. }
  207. return curr;
  208. }
  209. void SplitByStatements(TTokenIterator begin, TTokenIterator end, TVector<TTokenIterator>& output) {
  210. output.clear();
  211. if (begin == end) {
  212. return;
  213. }
  214. output.push_back(begin);
  215. auto curr = begin;
  216. while (curr != end) {
  217. curr = GetNextStatementBegin(curr, end);
  218. output.push_back(curr);
  219. }
  220. }
  221. }
  222. bool SplitQueryToStatements(const TString& query, NSQLTranslation::ILexer::TPtr& lexer, TVector<TString>& statements, NYql::TIssues& issues, const TString& file) {
  223. TParsedTokenList allTokens;
  224. auto onNextToken = [&](NSQLTranslation::TParsedToken&& token) {
  225. if (token.Name != "EOF") {
  226. allTokens.push_back(token);
  227. }
  228. };
  229. if (!lexer->Tokenize(query, file, onNextToken, issues, NSQLTranslation::SQL_MAX_PARSER_ERRORS)) {
  230. return false;
  231. }
  232. TVector<TTokenIterator> statementsTokens;
  233. SplitByStatements(allTokens.begin(), allTokens.end(), statementsTokens);
  234. for (size_t i = 1; i < statementsTokens.size(); ++i) {
  235. TStringBuilder currentQueryBuilder;
  236. for (auto it = statementsTokens[i - 1]; it != statementsTokens[i]; ++it) {
  237. currentQueryBuilder << it->Content;
  238. }
  239. TString statement = currentQueryBuilder;
  240. statement = StripStringLeft(statement);
  241. bool isBlank = true;
  242. for (auto c : statement) {
  243. if (c != ';') {
  244. isBlank = false;
  245. break;
  246. }
  247. };
  248. if (isBlank) {
  249. continue;
  250. }
  251. statements.push_back(statement);
  252. }
  253. return true;
  254. }
  255. } // namespace NSQLTranslationV1