// lexer.cpp (8.6 KB)
  #include "lexer.h"

  #include <yql/essentials/public/issue/yql_issue.h>
  #include <yql/essentials/parser/lexer_common/lexer.h>
  #include <yql/essentials/sql/v1/lexer/antlr3/lexer.h>
  #include <yql/essentials/sql/v1/lexer/antlr3_ansi/lexer.h>
  #include <yql/essentials/sql/v1/lexer/antlr4/lexer.h>
  #include <yql/essentials/sql/v1/lexer/antlr4_ansi/lexer.h>
  #include <yql/essentials/sql/settings/translation_settings.h>

  #include <util/string/ascii.h>
  #include <util/string/builder.h>
  #include <util/string/strip.h>

  #if defined(_tsan_enabled_)
  #include <util/system/mutex.h>
  #endif

  #include <functional>
  #include <utility>
  15. namespace NSQLTranslationV1 {
  16. TLexers MakeAllLexers() {
  17. return TLexers {
  18. .Antlr3 = MakeAntlr3LexerFactory(),
  19. .Antlr3Ansi = MakeAntlr3AnsiLexerFactory(),
  20. .Antlr4 = MakeAntlr4LexerFactory(),
  21. .Antlr4Ansi = MakeAntlr4AnsiLexerFactory()
  22. };
  23. }
  24. namespace {
  25. #if defined(_tsan_enabled_)
  26. TMutex SanitizerSQLTranslationMutex;
  27. #endif
  28. using NSQLTranslation::ILexer;
  29. using NSQLTranslation::MakeDummyLexerFactory;
  30. class TV1Lexer : public ILexer {
  31. public:
  32. explicit TV1Lexer(const TLexers& lexers, bool ansi, bool antlr4)
  33. : Factory(GetFactory(lexers, ansi, antlr4))
  34. {
  35. }
  36. bool Tokenize(const TString& query, const TString& queryName, const TTokenCallback& onNextToken, NYql::TIssues& issues, size_t maxErrors) override {
  37. #if defined(_tsan_enabled_)
  38. TGuard<TMutex> grd(SanitizerSQLTranslationMutex);
  39. #endif
  40. return Factory->MakeLexer()->Tokenize(query, queryName, onNextToken, issues, maxErrors);
  41. }
  42. private:
  43. static NSQLTranslation::TLexerFactoryPtr GetFactory(const TLexers& lexers, bool ansi, bool antlr4) {
  44. if (!ansi && !antlr4) {
  45. if (lexers.Antlr3) {
  46. return lexers.Antlr3;
  47. }
  48. if (lexers.Antlr4) {
  49. return lexers.Antlr4;
  50. }
  51. return MakeDummyLexerFactory("antlr3");
  52. } else if (ansi && !antlr4) {
  53. if (lexers.Antlr3Ansi) {
  54. return lexers.Antlr3Ansi;
  55. }
  56. if (lexers.Antlr4Ansi) {
  57. return lexers.Antlr4Ansi;
  58. }
  59. return MakeDummyLexerFactory("antlr3_ansi");
  60. } else if (!ansi && antlr4) {
  61. if (lexers.Antlr4) {
  62. return lexers.Antlr4;
  63. }
  64. return MakeDummyLexerFactory("antlr4");
  65. } else {
  66. if (lexers.Antlr4Ansi) {
  67. return lexers.Antlr4Ansi;
  68. }
  69. return MakeDummyLexerFactory("antlr4_ansi");
  70. }
  71. }
  72. private:
  73. NSQLTranslation::TLexerFactoryPtr Factory;
  74. };
  75. } // namespace
  76. NSQLTranslation::ILexer::TPtr MakeLexer(bool ansi, bool antlr4) {
  77. return NSQLTranslation::ILexer::TPtr(new TV1Lexer(MakeAllLexers(), ansi, antlr4));
  78. }
  79. NSQLTranslation::ILexer::TPtr MakeLexer(const TLexers& lexers, bool ansi, bool antlr4) {
  80. return NSQLTranslation::ILexer::TPtr(new TV1Lexer(lexers, ansi, antlr4));
  81. }
  82. bool IsProbablyKeyword(const NSQLTranslation::TParsedToken& token) {
  83. return AsciiEqualsIgnoreCase(token.Name, token.Content);
  84. }
  85. using NSQLTranslation::TParsedTokenList;
  86. using TTokenIterator = TParsedTokenList::const_iterator;
  87. namespace {
  // Result of one "advance" step over the token stream: the step consumed an
  // opening bracket-like construct (Open), a closing one (Close), or something
  // that does not affect nesting (None).
  //
  // Scoped so that the generic names Open / Close / None do not leak into the
  // enclosing namespace and cannot be converted to integers implicitly.
  enum class EParenType {
      Open,
      Close,
      None
  };
  93. using TAdvanceCallback = std::function<EParenType(TTokenIterator& curr, TTokenIterator end)>;
  94. TTokenIterator SkipWS(TTokenIterator curr, TTokenIterator end) {
  95. while (curr != end && curr->Name == "WS") {
  96. ++curr;
  97. }
  98. return curr;
  99. }
  100. TTokenIterator SkipWSOrComment(TTokenIterator curr, TTokenIterator end) {
  101. while (curr != end && (curr->Name == "WS" || curr->Name == "COMMENT")) {
  102. ++curr;
  103. }
  104. return curr;
  105. }
  106. TTokenIterator SkipToNextBalanced(TTokenIterator begin, TTokenIterator end, const TAdvanceCallback& advance) {
  107. i64 level = 0;
  108. TTokenIterator curr = begin;
  109. while (curr != end) {
  110. switch (advance(curr, end)) {
  111. case EParenType::Open: {
  112. ++level;
  113. break;
  114. }
  115. case EParenType::Close: {
  116. --level;
  117. if (level < 0) {
  118. return end;
  119. } else if (level == 0) {
  120. return curr;
  121. }
  122. break;
  123. }
  124. case EParenType::None:
  125. break;
  126. }
  127. }
  128. return curr;
  129. }
  130. TTokenIterator GetNextStatementBegin(TTokenIterator begin, TTokenIterator end) {
  131. TAdvanceCallback advanceLambdaBody = [](TTokenIterator& curr, TTokenIterator end) -> EParenType {
  132. Y_UNUSED(end);
  133. if (curr->Name == "LBRACE_CURLY") {
  134. ++curr;
  135. return EParenType::Open;
  136. } else if (curr->Name == "RBRACE_CURLY") {
  137. ++curr;
  138. return EParenType::Close;
  139. } else {
  140. ++curr;
  141. return EParenType::None;
  142. }
  143. };
  144. TAdvanceCallback advanceAction = [](TTokenIterator& curr, TTokenIterator end) -> EParenType {
  145. auto tmp = curr;
  146. if (curr->Name == "DEFINE") {
  147. ++curr;
  148. curr = SkipWSOrComment(curr, end);
  149. if (curr != end && (curr->Name == "ACTION" || curr->Name == "SUBQUERY")) {
  150. ++curr;
  151. return EParenType::Open;
  152. }
  153. } else if (curr->Name == "END") {
  154. ++curr;
  155. curr = SkipWSOrComment(curr, end);
  156. if (curr != end && curr->Name == "DEFINE") {
  157. ++curr;
  158. return EParenType::Close;
  159. }
  160. }
  161. curr = tmp;
  162. ++curr;
  163. return EParenType::None;
  164. };
  165. TAdvanceCallback advanceInlineAction = [](TTokenIterator& curr, TTokenIterator end) -> EParenType {
  166. auto tmp = curr;
  167. if (curr->Name == "DO") {
  168. ++curr;
  169. curr = SkipWSOrComment(curr, end);
  170. if (curr != end && curr->Name == "BEGIN") {
  171. ++curr;
  172. return EParenType::Open;
  173. }
  174. } else if (curr->Name == "END") {
  175. ++curr;
  176. curr = SkipWSOrComment(curr, end);
  177. if (curr != end && curr->Name == "DO") {
  178. ++curr;
  179. return EParenType::Close;
  180. }
  181. }
  182. curr = tmp;
  183. ++curr;
  184. return EParenType::None;
  185. };
  186. TTokenIterator curr = begin;
  187. while (curr != end) {
  188. bool matched = false;
  189. for (auto cb : {advanceLambdaBody, advanceAction, advanceInlineAction}) {
  190. TTokenIterator tmp = curr;
  191. if (cb(tmp, end) == EParenType::Open) {
  192. curr = SkipToNextBalanced(curr, end, cb);
  193. matched = true;
  194. if (curr == end) {
  195. return curr;
  196. }
  197. }
  198. }
  199. if (matched) {
  200. continue;
  201. }
  202. if (curr->Name == "SEMICOLON") {
  203. auto next = SkipWS(curr + 1, end);
  204. while (next != end && next->Name == "COMMENT" && curr->Line == next->Line) {
  205. curr = next;
  206. next = SkipWS(next + 1, end);
  207. }
  208. ++curr;
  209. break;
  210. }
  211. ++curr;
  212. }
  213. return curr;
  214. }
  215. void SplitByStatements(TTokenIterator begin, TTokenIterator end, TVector<TTokenIterator>& output) {
  216. output.clear();
  217. if (begin == end) {
  218. return;
  219. }
  220. output.push_back(begin);
  221. auto curr = begin;
  222. while (curr != end) {
  223. curr = GetNextStatementBegin(curr, end);
  224. output.push_back(curr);
  225. }
  226. }
  227. }
  228. bool SplitQueryToStatements(const TString& query, NSQLTranslation::ILexer::TPtr& lexer, TVector<TString>& statements, NYql::TIssues& issues, const TString& file) {
  229. TParsedTokenList allTokens;
  230. auto onNextToken = [&](NSQLTranslation::TParsedToken&& token) {
  231. if (token.Name != "EOF") {
  232. allTokens.push_back(token);
  233. }
  234. };
  235. if (!lexer->Tokenize(query, file, onNextToken, issues, NSQLTranslation::SQL_MAX_PARSER_ERRORS)) {
  236. return false;
  237. }
  238. TVector<TTokenIterator> statementsTokens;
  239. SplitByStatements(allTokens.begin(), allTokens.end(), statementsTokens);
  240. for (size_t i = 1; i < statementsTokens.size(); ++i) {
  241. TStringBuilder currentQueryBuilder;
  242. for (auto it = statementsTokens[i - 1]; it != statementsTokens[i]; ++it) {
  243. currentQueryBuilder << it->Content;
  244. }
  245. TString statement = currentQueryBuilder;
  246. statement = StripStringLeft(statement);
  247. bool isBlank = true;
  248. for (auto c : statement) {
  249. if (c != ';') {
  250. isBlank = false;
  251. break;
  252. }
  253. };
  254. if (isBlank) {
  255. continue;
  256. }
  257. statements.push_back(statement);
  258. }
  259. return true;
  260. }
  261. } // namespace NSQLTranslationV1