yql_ast.cpp 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665
  1. #include "yql_ast.h"
  2. #include "yql_ast_escaping.h"
  3. #include <util/string/builder.h>
  4. #include <util/system/compiler.h>
  5. #include <library/cpp/containers/stack_vector/stack_vec.h>
  6. #include <yql/essentials/utils/utf8.h>
  7. #include <cstdlib>
  8. namespace NYql {
  9. namespace {
  10. inline bool IsWhitespace(char c) {
  11. return c == ' ' || c == '\n' || c == '\r' || c == '\t';
  12. }
  13. inline bool IsListStart(char c) { return c == '('; }
  14. inline bool IsListEnd(char c) { return c == ')'; }
  15. inline bool IsCommentStart(char c) { return c == '#'; }
  16. inline bool IsQuoteStart(char c) { return c == '\''; }
  17. inline bool IsStringStart(char c) { return c == '"'; }
  18. inline bool IsHexStringStart(char c) { return c == 'x'; }
  19. inline bool IsMultilineStringStart(char c) { return c == '@'; }
  20. inline bool NeedEscaping(const TStringBuf& str) {
  21. for (char ch: str) {
  22. if (IsWhitespace(ch) || IsListStart(ch) ||
  23. IsListEnd(ch) || IsCommentStart(ch) ||
  24. IsQuoteStart(ch) || IsStringStart(ch) ||
  25. !isprint(ch & 0xff))
  26. {
  27. return true;
  28. }
  29. }
  30. return str.empty();
  31. }
  32. ///////////////////////////////////////////////////////////////////////////
  33. // TAstParser
  34. ///////////////////////////////////////////////////////////////////////////
  35. class TAstParserContext {
  36. public:
  37. inline TAstParserContext(const TStringBuf& str, TMemoryPool* externalPool, const TString& file)
  38. : Str_(str)
  39. , Position_(1, 1, file)
  40. , Offset_(0)
  41. , Pool_(externalPool)
  42. {
  43. if (!Pool_) {
  44. InnerPool_ = std::make_unique<TMemoryPool>(4096);
  45. Pool_ = InnerPool_.get();
  46. }
  47. }
  48. inline char Peek() const {
  49. Y_ABORT_UNLESS(!AtEnd());
  50. return Str_[Offset_];
  51. }
  52. inline bool AtEnd() const {
  53. return Str_.size() == Offset_;
  54. }
  55. inline char Next() {
  56. Y_ABORT_UNLESS(!AtEnd());
  57. char ch = Str_[Offset_];
  58. if (ch == '\n') {
  59. ++Position_.Row;
  60. Position_.Column = 1;
  61. } else {
  62. ++Position_.Column;
  63. }
  64. ++Offset_;
  65. return ch;
  66. }
  67. // stops right afetr stopChar
  68. inline void SeekTo(char stopChar) {
  69. while (!AtEnd() && Next() != stopChar) {
  70. // empty loop
  71. }
  72. }
  73. inline TStringBuf GetToken(ui32 begin, ui32 end) {
  74. Y_ABORT_UNLESS(end >= begin);
  75. return Str_.SubString(begin, end - begin);
  76. }
  77. inline bool IsAtomEnded() {
  78. if (AtEnd()) {
  79. return true;
  80. }
  81. char c = Peek();
  82. return IsWhitespace(c) || IsListStart(c) || IsListEnd(c);
  83. }
  84. inline const TStringBuf& Str() const { return Str_; }
  85. inline ui32 Offset() const { return Offset_; }
  86. inline const TPosition& Position() const { return Position_; }
  87. inline TMemoryPool& Pool() { return *Pool_; }
  88. inline std::unique_ptr<TMemoryPool>&& InnerPool() { return std::move(InnerPool_); }
  89. private:
  90. TStringBuf Str_;
  91. TPosition Position_;
  92. ui32 Offset_;
  93. TMemoryPool* Pool_;
  94. std::unique_ptr<TMemoryPool> InnerPool_;
  95. };
  96. ///////////////////////////////////////////////////////////////////////////
  97. // TAstParser
  98. ///////////////////////////////////////////////////////////////////////////
  99. class TAstParser {
  100. public:
  101. TAstParser(const TStringBuf& str, TMemoryPool* externalPool, const TString& file)
  102. : Ctx_(str, externalPool, file)
  103. {
  104. }
  105. TAstParseResult Parse() {
  106. TAstNode* root = nullptr;
  107. if (!IsUtf8(Ctx_.Str())) {
  108. AddError("Invalid UTF8 input");
  109. } else {
  110. root = ParseList(0U);
  111. SkipSpace();
  112. if (!Ctx_.AtEnd()) {
  113. AddError("Unexpected symbols after end of root list");
  114. }
  115. }
  116. TAstParseResult result;
  117. if (!Issues_.Empty()) {
  118. result.Issues = std::move(Issues_);
  119. } else {
  120. result.Root = root;
  121. result.Pool = Ctx_.InnerPool();
  122. }
  123. return result;
  124. }
  125. private:
  126. inline void AddError(const TString& message) {
  127. Issues_.AddIssue(Ctx_.Position(), message);
  128. }
  129. inline void SkipComment() {
  130. Ctx_.SeekTo('\n');
  131. }
  132. void SkipSpace() {
  133. while (!Ctx_.AtEnd()) {
  134. char c = Ctx_.Peek();
  135. if (IsWhitespace(c)) {
  136. Ctx_.Next();
  137. continue;
  138. }
  139. if (IsCommentStart(c)) {
  140. SkipComment();
  141. continue;
  142. }
  143. break;
  144. }
  145. }
  146. TAstNode* ParseList(size_t level) {
  147. if (level >= 1000U) {
  148. AddError("Too deep graph!");
  149. return nullptr;
  150. }
  151. SkipSpace();
  152. if (Ctx_.AtEnd()) {
  153. AddError("Unexpected end");
  154. return nullptr;
  155. }
  156. if (!IsListStart(Ctx_.Peek())) {
  157. AddError("Expected (");
  158. return nullptr;
  159. }
  160. Ctx_.Next();
  161. TSmallVec<TAstNode*> children;
  162. auto listPos = Ctx_.Position();
  163. while (true) {
  164. SkipSpace();
  165. if (Ctx_.AtEnd()) {
  166. AddError("Expected )");
  167. return nullptr;
  168. }
  169. if (IsListEnd(Ctx_.Peek())) {
  170. Ctx_.Next();
  171. return TAstNode::NewList(listPos, children.data(), children.size(), Ctx_.Pool());
  172. }
  173. TAstNode* elem = ParseElement(level);
  174. if (!elem)
  175. return nullptr;
  176. children.push_back(elem);
  177. }
  178. }
  179. TAstNode* ParseElement(size_t level) {
  180. if (Ctx_.AtEnd()) {
  181. AddError("Expected element");
  182. return nullptr;
  183. }
  184. char c = Ctx_.Peek();
  185. if (IsQuoteStart(c)) {
  186. auto resPosition = Ctx_.Position();
  187. Ctx_.Next();
  188. char ch = Ctx_.Peek();
  189. if (Ctx_.AtEnd() || IsWhitespace(ch) || IsCommentStart(ch) ||
  190. IsListEnd(ch))
  191. {
  192. AddError("Expected quotation");
  193. return nullptr;
  194. }
  195. TAstNode* content = IsListStart(ch)
  196. ? ParseList(++level)
  197. : ParseAtom();
  198. if (!content)
  199. return nullptr;
  200. return TAstNode::NewList(resPosition, Ctx_.Pool(), &TAstNode::QuoteAtom, content);
  201. }
  202. if (IsListStart(c))
  203. return ParseList(++level);
  204. return ParseAtom();
  205. }
  206. TAstNode* ParseAtom() {
  207. if (Ctx_.AtEnd()) {
  208. AddError("Expected atom");
  209. return nullptr;
  210. }
  211. auto resPosition = Ctx_.Position();
  212. ui32 atomStart = Ctx_.Offset();
  213. while (true) {
  214. char c = Ctx_.Peek();
  215. // special symbols = 0x20, 0x0a, 0x0d, 0x09 space
  216. // 0x22, 0x23, 0x28, 0x29 "#()
  217. // 0x27 '
  218. // special symbols = 0x40, 0x78 @x
  219. // &0x3f = 0x00,0x38
  220. #define MASK(x) (1ull << ui64(x))
  221. const ui64 mask1 = MASK(0x20) | MASK(0x0a) | MASK(0x0d)
  222. | MASK(0x09) | MASK(0x22) | MASK(0x23) | MASK(0x28) | MASK(0x29) | MASK(0x27);
  223. const ui64 mask2 = MASK(0x00) | MASK(0x38);
  224. #undef MASK
  225. if (!(c & 0x80) && ((1ull << (c & 0x3f)) & (c <= 0x3f ? mask1 : mask2))) {
  226. if (IsWhitespace(c) || IsListStart(c) || IsListEnd(c))
  227. break;
  228. if (IsCommentStart(c)) {
  229. AddError("Unexpected comment");
  230. return nullptr;
  231. }
  232. if (IsQuoteStart(c)) {
  233. AddError("Unexpected quotation");
  234. return nullptr;
  235. }
  236. // multiline starts with '@@'
  237. if (IsMultilineStringStart(c)) {
  238. Ctx_.Next();
  239. if (Ctx_.AtEnd()) break;
  240. if (!IsMultilineStringStart(Ctx_.Peek())) {
  241. continue;
  242. }
  243. TString token;
  244. if (!TryParseMultilineToken(token)) {
  245. return nullptr;
  246. }
  247. if (!Ctx_.IsAtomEnded()) {
  248. AddError("Unexpected end of @@");
  249. return nullptr;
  250. }
  251. return TAstNode::NewAtom(resPosition, token, Ctx_.Pool(), TNodeFlags::MultilineContent);
  252. }
  253. // hex string starts with 'x"'
  254. else if (IsHexStringStart(c)) {
  255. Ctx_.Next(); // skip 'x'
  256. if (Ctx_.AtEnd()) break;
  257. if (!IsStringStart(Ctx_.Peek())) {
  258. continue;
  259. }
  260. Ctx_.Next(); // skip first '"'
  261. size_t readBytes = 0;
  262. TStringStream ss;
  263. TStringBuf atom = Ctx_.Str().SubStr(Ctx_.Offset());
  264. EUnescapeResult unescapeResult = UnescapeBinaryAtom(
  265. atom, '"', &ss, &readBytes);
  266. // advance position
  267. while (readBytes-- != 0) {
  268. Ctx_.Next();
  269. }
  270. if (unescapeResult != EUnescapeResult::OK) {
  271. AddError(TString(UnescapeResultToString(unescapeResult)));
  272. return nullptr;
  273. }
  274. Ctx_.Next(); // skip last '"'
  275. if (!Ctx_.IsAtomEnded()) {
  276. AddError("Unexpected end of \"");
  277. return nullptr;
  278. }
  279. return TAstNode::NewAtom(resPosition, ss.Str(), Ctx_.Pool(), TNodeFlags::BinaryContent);
  280. }
  281. else if (IsStringStart(c)) {
  282. if (Ctx_.Offset() != atomStart) {
  283. AddError("Unexpected \"");
  284. return nullptr;
  285. }
  286. Ctx_.Next(); // skip first '"'
  287. size_t readBytes = 0;
  288. TStringStream ss;
  289. TStringBuf atom = Ctx_.Str().SubStr(Ctx_.Offset());
  290. EUnescapeResult unescapeResult = UnescapeArbitraryAtom(
  291. atom, '"', &ss, &readBytes);
  292. // advance position
  293. while (readBytes-- != 0) {
  294. Ctx_.Next();
  295. }
  296. if (unescapeResult != EUnescapeResult::OK) {
  297. AddError(TString(UnescapeResultToString(unescapeResult)));
  298. return nullptr;
  299. }
  300. if (!Ctx_.IsAtomEnded()) {
  301. AddError("Unexpected end of \"");
  302. return nullptr;
  303. }
  304. return TAstNode::NewAtom(resPosition, ss.Str(), Ctx_.Pool(), TNodeFlags::ArbitraryContent);
  305. }
  306. }
  307. Ctx_.Next();
  308. if (Ctx_.AtEnd()) {
  309. break;
  310. }
  311. }
  312. return TAstNode::NewAtom(resPosition, Ctx_.GetToken(atomStart, Ctx_.Offset()), Ctx_.Pool());
  313. }
  314. bool TryParseMultilineToken(TString& token) {
  315. Ctx_.Next(); // skip second '@'
  316. ui32 start = Ctx_.Offset();
  317. while (true) {
  318. Ctx_.SeekTo('@');
  319. if (Ctx_.AtEnd()) {
  320. AddError("Unexpected multiline atom end");
  321. return false;
  322. }
  323. ui32 count = 1; // we seek to first '@'
  324. while (!Ctx_.AtEnd() && Ctx_.Peek() == '@') {
  325. count++;
  326. Ctx_.Next();
  327. if (count == 4) {
  328. // Reduce each four '@' to two
  329. token.append(Ctx_.GetToken(start, Ctx_.Offset() - 2));
  330. start = Ctx_.Offset();
  331. count = 0;
  332. }
  333. }
  334. if (count >= 2) {
  335. break;
  336. }
  337. }
  338. // two '@@' at the end
  339. token.append(Ctx_.GetToken(start, Ctx_.Offset() - 2));
  340. return true;
  341. }
  342. private:
  343. TAstParserContext Ctx_;
  344. TIssues Issues_;
  345. };
  346. ///////////////////////////////////////////////////////////////////////////
  347. // ast node printing functions
  348. ///////////////////////////////////////////////////////////////////////////
  349. inline bool IsQuoteNode(const TAstNode& node) {
  350. return node.GetChildrenCount() == 2
  351. && node.GetChild(0)->GetType() == TAstNode::Atom
  352. && node.GetChild(0)->GetContent() == TStringBuf("quote");
  353. }
  354. inline bool IsBlockNode(const TAstNode& node) {
  355. return node.GetChildrenCount() == 2
  356. && node.GetChild(0)->GetType() == TAstNode::Atom
  357. && node.GetChild(0)->GetContent() == TStringBuf("block");
  358. }
  359. Y_NO_INLINE void Indent(IOutputStream& out, ui32 indentation) {
  360. char* whitespaces = reinterpret_cast<char*>(alloca(indentation));
  361. memset(whitespaces, ' ', indentation);
  362. out.Write(whitespaces, indentation);
  363. }
  364. void MultilineAtomPrint(IOutputStream& out, const TStringBuf& str) {
  365. out << TStringBuf("@@");
  366. size_t idx = str.find('@');
  367. if (idx == TString::npos) {
  368. out << str;
  369. } else {
  370. const char* begin = str.data();
  371. do {
  372. ui32 count = 0;
  373. for (; idx < str.length() && str[idx] == '@'; ++idx) {
  374. ++count;
  375. }
  376. if (count % 2 == 0) {
  377. out.Write(begin, idx - (begin - str.data()) - count);
  378. begin = str.data() + idx;
  379. while (count--) {
  380. out.Write(TStringBuf("@@"));
  381. }
  382. }
  383. idx = str.find('@', idx);
  384. } while (idx != TString::npos);
  385. out.Write(begin, str.end() - begin);
  386. }
  387. out << TStringBuf("@@");
  388. }
  389. void PrintNode(IOutputStream& out, const TAstNode& node) {
  390. if (node.GetType() == TAstNode::Atom) {
  391. if (node.GetFlags() & TNodeFlags::ArbitraryContent) {
  392. EscapeArbitraryAtom(node.GetContent(), '"', &out);
  393. } else if (node.GetFlags() & TNodeFlags::BinaryContent) {
  394. EscapeBinaryAtom(node.GetContent(), '"', &out);
  395. } else if (node.GetFlags() & TNodeFlags::MultilineContent) {
  396. MultilineAtomPrint(out, node.GetContent());
  397. } else if (node.GetContent().empty()) {
  398. EscapeArbitraryAtom(node.GetContent(), '"', &out);
  399. } else {
  400. out << node.GetContent();
  401. }
  402. } else if (node.GetType() == TAstNode::List) {
  403. if (!node.GetChildrenCount()) {
  404. out << TStringBuf("()");
  405. } else if (IsQuoteNode(node)) {
  406. out << '\'';
  407. PrintNode(out, *node.GetChild(1));
  408. } else {
  409. out << '(';
  410. ui32 index = 0;
  411. while (true) {
  412. PrintNode(out, *node.GetChild(index));
  413. ++index;
  414. if (index == node.GetChildrenCount()) break;
  415. out << ' ';
  416. }
  417. out << ')';
  418. }
  419. }
  420. }
  421. void PrettyPrintNode(
  422. IOutputStream& out, const TAstNode& node,
  423. i32 indent, i32 blockLevel, i32 localIndent, ui32 flags)
  424. {
  425. if (!(flags & TAstPrintFlags::PerLine)) {
  426. Indent(out, indent * 2);
  427. } else if (localIndent == 1) {
  428. Indent(out, blockLevel * 2);
  429. }
  430. bool isQuote = false;
  431. if (node.GetType() == TAstNode::Atom) {
  432. if (node.GetFlags() & TNodeFlags::ArbitraryContent) {
  433. if ((flags & TAstPrintFlags::AdaptArbitraryContent) &&
  434. !NeedEscaping(node.GetContent()))
  435. {
  436. out << node.GetContent();
  437. } else {
  438. EscapeArbitraryAtom(node.GetContent(), '"', &out);
  439. }
  440. } else if (node.GetFlags() & TNodeFlags::BinaryContent) {
  441. EscapeBinaryAtom(node.GetContent(), '"', &out);
  442. } else if (node.GetFlags() & TNodeFlags::MultilineContent) {
  443. MultilineAtomPrint(out, node.GetContent());
  444. } else {
  445. if (((flags & TAstPrintFlags::AdaptArbitraryContent) && NeedEscaping(node.GetContent())) ||
  446. node.GetContent().empty())
  447. {
  448. EscapeArbitraryAtom(node.GetContent(), '"', &out);
  449. } else {
  450. out << node.GetContent();
  451. }
  452. }
  453. } else if (node.GetType() == TAstNode::List) {
  454. isQuote = IsQuoteNode(node);
  455. if (isQuote && (flags & TAstPrintFlags::ShortQuote)) {
  456. out << '\'';
  457. if (localIndent == 0 || !(flags & TAstPrintFlags::PerLine)) {
  458. out << Endl;
  459. }
  460. PrettyPrintNode(out, *node.GetChild(1), indent + 1, blockLevel, localIndent + 1, flags);
  461. } else {
  462. out << '(';
  463. if (localIndent == 0 || !(flags & TAstPrintFlags::PerLine)) {
  464. out << Endl;
  465. }
  466. bool isBlock = IsBlockNode(node);
  467. for (ui32 index = 0; index < node.GetChildrenCount(); ++index) {
  468. if (localIndent > 0 && (index > 0) && (flags & TAstPrintFlags::PerLine)) {
  469. out << ' ';
  470. }
  471. if (isBlock && (index > 0)) {
  472. PrettyPrintNode(out, *node.GetChild(index), indent + 1, blockLevel + 1, -1, flags);
  473. } else {
  474. PrettyPrintNode(out, *node.GetChild(index), indent + 1, blockLevel, localIndent + 1, flags);
  475. }
  476. }
  477. }
  478. if (!isQuote || !(flags & TAstPrintFlags::ShortQuote)) {
  479. if (!(flags & TAstPrintFlags::PerLine)) {
  480. Indent(out, indent * 2);
  481. }
  482. if (localIndent == 0 && blockLevel > 0) {
  483. Indent(out, (blockLevel - 1) * 2);
  484. }
  485. out << ')';
  486. }
  487. }
  488. if (!isQuote || !(flags & TAstPrintFlags::ShortQuote)) {
  489. if (localIndent > 0 || blockLevel == 0) {
  490. if (localIndent <= 1 || !(flags & TAstPrintFlags::PerLine)) {
  491. out << Endl;
  492. }
  493. }
  494. }
  495. }
  496. void DestroyNode(TAstNode* node) {
  497. if (node->IsList()) {
  498. for (ui32 i = 0; i < node->GetChildrenCount(); ++i) {
  499. DestroyNode(node->GetChild(i));
  500. }
  501. }
  502. if (node != &TAstNode::QuoteAtom) {
  503. node->Destroy();
  504. }
  505. }
  506. } // namespace
  507. TAstParseResult::~TAstParseResult() {
  508. Destroy();
  509. }
  510. TAstParseResult::TAstParseResult(TAstParseResult&& other)
  511. : Pool(std::move(other.Pool))
  512. , Root(other.Root)
  513. , Issues(std::move(other.Issues))
  514. , PgAutoParamValues(std::move(other.PgAutoParamValues))
  515. , ActualSyntaxType(other.ActualSyntaxType)
  516. {
  517. other.Root = nullptr;
  518. }
  519. TAstParseResult& TAstParseResult::operator=(TAstParseResult&& other) {
  520. Destroy();
  521. Pool = std::move(other.Pool);
  522. Root = other.Root;
  523. other.Root = nullptr;
  524. Issues = std::move(other.Issues);
  525. PgAutoParamValues = std::move(other.PgAutoParamValues);
  526. ActualSyntaxType = other.ActualSyntaxType;
  527. return *this;
  528. }
  529. void TAstParseResult::Destroy() {
  530. if (Root) {
  531. DestroyNode(Root);
  532. Root = nullptr;
  533. }
  534. }
  535. TAstParseResult ParseAst(const TStringBuf& str, TMemoryPool* externalPool, const TString& file)
  536. {
  537. TAstParser parser(str, externalPool, file);
  538. return parser.Parse();
  539. }
  540. void TAstNode::PrintTo(IOutputStream& out) const {
  541. PrintNode(out, *this);
  542. }
  543. void TAstNode::PrettyPrintTo(IOutputStream& out, ui32 flags) const {
  544. PrettyPrintNode(out, *this, 0, 0, 0, flags);
  545. }
  546. TAstNode TAstNode::QuoteAtom(TPosition(0, 0), TStringBuf("quote"), TNodeFlags::Default);
  547. } // namespace NYql
  548. template<>
  549. void Out<NYql::TAstNode::EType>(class IOutputStream &o, NYql::TAstNode::EType x) {
  550. #define YQL_AST_NODE_TYPE_MAP_TO_STRING_IMPL(name, ...) \
  551. case ::NYql::TAstNode::name: \
  552. o << #name; \
  553. return;
  554. switch (x) {
  555. YQL_AST_NODE_TYPE_MAP(YQL_AST_NODE_TYPE_MAP_TO_STRING_IMPL)
  556. default:
  557. o << static_cast<int>(x);
  558. return;
  559. }
  560. }