Browse Source

Moved yql/ast YQL-19206

init
commit_hash:a6a63582073784b9318cc04ffcc1e212f3df703b
vvvv 4 months ago
parent
commit
8cf98f8169

+ 13 - 0
yql/essentials/ast/serialize/ya.make

@@ -0,0 +1,13 @@
+LIBRARY()
+
+SRCS(
+    yql_expr_serialize.cpp
+)
+
+PEERDIR(
+    yql/essentials/ast
+    yql/essentials/core/issue
+    contrib/ydb/library/yql/minikql
+)
+
+END()

+ 499 - 0
yql/essentials/ast/serialize/yql_expr_serialize.cpp

@@ -0,0 +1,499 @@
+#include "yql_expr_serialize.h"
+#include <contrib/ydb/library/yql/minikql/pack_num.h>
+#include <util/generic/algorithm.h>
+#include <util/generic/deque.h>
+
+namespace NYql {
+
+namespace {
+
+// Bit layout of the single command byte that starts every serialized node.
+// A byte without the NODE_VALUE bit is a back-reference to a node written earlier;
+// otherwise the low bits hold the TExprNode type (or, for atoms, the TNodeFlags
+// bits) and the remaining bits are the modifiers listed below.
+enum ESerializeCommands {
+    NODE_REF = 0x00, // NODE_VALUE bit is clear: the 1-based id of an earlier node follows
+    NODE_VALUE = 0x10, // a complete node description follows
+    INLINE_STR = 0x08, // string is unique, don't write it to the pool
+    SAME_POSITION = 0x40, // position equals the previously written one and is omitted
+    ATOM_FLAG = 0x20, // node is an atom
+    WIDE = 0x80, // mark wide lambdas
+    ATOM = ATOM_FLAG | NODE_VALUE,  // for atoms we will use TNodeFlags bits (1/2/4)
+    LIST = TExprNode::List | NODE_VALUE,
+    CALLABLE = TExprNode::Callable | NODE_VALUE,
+    LAMBDA = TExprNode::Lambda | NODE_VALUE,
+    ARGUMENT = TExprNode::Argument | NODE_VALUE,
+    ARGUMENTS = TExprNode::Arguments | NODE_VALUE,
+    WORLD = TExprNode::World | NODE_VALUE,
+};
+
+using namespace NKikimr;
+
+class TWriter {
+public:
+    TWriter(TExprContext& ctx, ui16 components)
+        : Ctx(ctx)
+        , Components_(components)
+    {
+    }
+
+    const TString& Out() const {
+        //Cerr << "Nodes:" << WrittenNodes_.size() << ", pos: " << Positions_.size() << ", bytes: " << Out_.size() << "\n";
+        return Out_;
+    }
+
+    void Prepare(const TExprNode& node) {
+        TNodeSet visited;
+        PrepareImpl(node, visited);
+    }
+
+    void Init() {
+        WriteVar32(Components_);
+        ui32 reusedStringCount = 0;
+        for (auto& x : StringCounters_) {
+            if (x.second.first > 1) {
+                x.second.second = reusedStringCount;
+                ++reusedStringCount;
+            }
+        }
+
+        WriteVar32(reusedStringCount);
+        TVector<std::pair<TStringBuf, ui32>> sortedStrings;
+        sortedStrings.reserve(reusedStringCount);
+        for (const auto& x : StringCounters_) {
+            if (x.second.first > 1) {
+                sortedStrings.push_back({ x.first, x.second.second });
+            }
+        }
+
+        Sort(sortedStrings.begin(), sortedStrings.end(), [](const auto& x, const auto& y) { return x.second < y.second; });
+
+        for (const auto& x : sortedStrings) {
+            WriteVar32(x.first.length());
+            WriteMany(x.first.data(), x.first.length());
+        }
+
+        if (Components_ & TSerializedExprGraphComponents::Positions) {
+            WriteVar32(Files_.size());
+            TVector<std::pair<TStringBuf, ui32>> sortedFiles;
+            sortedFiles.reserve(Files_.size());
+            for (const auto& x : Files_) {
+                sortedFiles.push_back({ x.first, x.second });
+            }
+
+            Sort(sortedFiles.begin(), sortedFiles.end(), [](const auto& x, const auto& y) { return x.second < y.second; });
+            for (const auto& x : sortedFiles) {
+                WriteVar32(x.first.length());
+                WriteMany(x.first.data(), x.first.length());
+            }
+
+            WriteVar32(Positions_.size());
+            TVector<std::tuple<ui32, ui32, ui32, ui32>> sortedPositions;
+            sortedPositions.reserve(Positions_.size());
+            for (const auto& x : Positions_) {
+                sortedPositions.push_back({ std::get<0>(x.first), std::get<1>(x.first), std::get<2>(x.first), x.second });
+            }
+
+            Sort(sortedPositions.begin(), sortedPositions.end(), [](const auto& x, const auto& y)
+                { return std::get<3>(x) < std::get<3>(y); });
+
+            for (const auto& x : sortedPositions) {
+                WriteVar32(std::get<0>(x));
+                WriteVar32(std::get<1>(x));
+                WriteVar32(std::get<2>(x));
+            }
+        }
+    }
+
+    // Serializes `node` followed, depth-first, by its children.
+    // A node that has already been written is emitted as NODE_REF plus its id.
+    // Relies on the string, file and position tables collected by Prepare() and
+    // numbered by Init(); a missing entry trips YQL_ENSURE.
+    void Save(const TExprNode& node) {
+        auto writtenIt = WrittenNodes_.find(&node);
+        if (writtenIt != WrittenNodes_.end()) {
+            Write(NODE_REF);
+            WriteVar32(writtenIt->second);
+            return;
+        }
+
+        const bool isAtom = node.Type() == TExprNode::Atom;
+        const bool isWideLambda = node.Type() == TExprNode::Lambda && node.ChildrenSize() > 2U;
+        // Only these node kinds carry a content string.
+        const bool hasContent = isAtom || node.Type() == TExprNode::Callable || node.Type() == TExprNode::Argument;
+        // Plain lambdas have an implicit child count, everything listed here stores it.
+        const bool hasChildCount = node.Type() == TExprNode::Callable || node.Type() == TExprNode::Arguments
+            || node.Type() == TExprNode::List || isWideLambda;
+        const bool withPositions = Components_ & TSerializedExprGraphComponents::Positions;
+
+        char command = isAtom ? ATOM : ((node.Type() & TExprNode::TypeMask) | NODE_VALUE);
+
+        if (isWideLambda) {
+            command |= WIDE;
+        }
+
+        if (withPositions) {
+            // will write position
+            if (Ctx.GetPosition(node.Pos()) == LastPosition_) {
+                command |= SAME_POSITION;
+            }
+        }
+
+        if (isAtom) {
+            command |= (TNodeFlags::FlagsMask & node.Flags());
+        }
+
+        ui32 strNum = 0;
+        if (hasContent) {
+            auto strIt = StringCounters_.find(node.Content());
+            YQL_ENSURE(strIt != StringCounters_.end());
+            if (strIt->second.first == 1) {
+                // Used exactly once: cheaper to store in place than in the pool.
+                command |= INLINE_STR;
+            } else {
+                strNum = strIt->second.second;
+            }
+        }
+
+        Write(command);
+        if (withPositions && !(command & SAME_POSITION)) {
+            const auto& pos = Ctx.GetPosition(node.Pos());
+            ui32 fileNum = 0;
+            if (pos.File) {
+                auto fileIt = Files_.find(pos.File);
+                YQL_ENSURE(fileIt != Files_.end());
+                fileNum = fileIt->second;
+            }
+
+            // Plain copies: the fields are read through a const reference, so
+            // std::move would not move anything.
+            auto posIt = Positions_.find(std::make_tuple(pos.Row, pos.Column, fileNum));
+            YQL_ENSURE(posIt != Positions_.end());
+            WriteVar32(posIt->second);
+            LastPosition_ = pos;
+        }
+
+        if (hasContent) {
+            if (command & INLINE_STR) {
+                WriteVar32(node.Content().length());
+                WriteMany(node.Content().data(), node.Content().length());
+            } else {
+                WriteVar32(strNum);
+            }
+        }
+
+        if (hasChildCount) {
+            WriteVar32(node.ChildrenSize());
+        }
+
+        for (const auto& x : node.Children()) {
+            Save(*x);
+        }
+
+        // Ids are 1-based and assigned after the children, i.e. in completion order.
+        WrittenNodes_.emplace(&node, 1 + WrittenNodes_.size());
+    }
+
+private:
+    // Collection pass over the graph rooted at `node`. When positions are
+    // serialized it registers every distinct file name and (row, column, file)
+    // triple; for atoms, callables and arguments it counts how many nodes use each
+    // content string. `visited` ensures that a shared node is processed only once.
+    void PrepareImpl(const TExprNode& node, TNodeSet& visited) {
+        if (!visited.emplace(&node).second) {
+            return;
+        }
+
+        if (Components_ & TSerializedExprGraphComponents::Positions) {
+            const auto& pos = Ctx.GetPosition(node.Pos());
+            const auto& file = pos.File;
+            ui32 fileNum = 0;
+            if (file) {
+                // File ids start at 1; 0 stands for "no file". emplace keeps the id
+                // of a file that is already known.
+                fileNum = Files_.emplace(file, 1 + (ui32)Files_.size()).first->second;
+            }
+
+            // Plain copies: `pos` is const, so std::move would not move anything.
+            Positions_.emplace(std::make_tuple(pos.Row, pos.Column, fileNum), (ui32)Positions_.size());
+        }
+
+        if (node.IsAtom() || node.IsCallable() || node.Type() == TExprNode::Argument) {
+            ++StringCounters_[node.Content()].first;
+        }
+
+        for (const auto& child : node.Children()) {
+            PrepareImpl(*child, visited);
+        }
+    }
+
+    Y_FORCE_INLINE void Write(char c) {
+        Out_.append(c);
+    }
+
+    Y_FORCE_INLINE void WriteMany(const void* buf, size_t len) {
+        Out_.AppendNoAlias((const char*)buf, len);
+    }
+
+    Y_FORCE_INLINE void WriteVar32(ui32 value) {
+        char buf[MAX_PACKED32_SIZE];
+        Out_.AppendNoAlias(buf, Pack32(value, buf));
+    }
+
+private:
+    TExprContext& Ctx;
+    const ui16 Components_;
+    THashMap<TStringBuf, ui32> Files_;
+    THashMap<std::tuple<ui32, ui32, ui32>, ui32> Positions_;
+    THashMap<TStringBuf, std::pair<ui32, ui32>> StringCounters_; // str -> id + serialized id
+
+    TNodeMap<ui32> WrittenNodes_;
+    TPosition LastPosition_;
+
+    TString Out_;
+};
+
+class TReader {
+public:
+    TReader(TPosition pos, TStringBuf buffer, TExprContext& ctx)
+        : Pos_(pos)
+        , Current_(buffer.data())
+        , End_(buffer.data() + buffer.size())
+        , Ctx_(ctx)
+        , Components_(0)
+    {
+    }
+
+    // Parses the whole buffer: the component mask, the pool of shared strings,
+    // the optional file and position tables, and finally the root node.
+    // Returns the root node, or nullptr after adding a fatal issue to the context
+    // when a yexception is raised while reading (truncated or corrupted data,
+    // including trailing bytes after the root node).
+    TExprNode::TPtr Load() {
+        try {
+            // Reads an element count and rejects values that cannot possibly fit into
+            // the rest of the buffer. Every table entry takes at least `minItemSize`
+            // bytes (each packed number is at least one byte), so this never rejects
+            // valid data, but it keeps a corrupted count from triggering a huge
+            // reserve() whose allocation failure would not be a yexception.
+            const auto readCount = [this](size_t minItemSize) {
+                const ui32 count = ReadVar32();
+                if (count > static_cast<size_t>(End_ - Current_) / minItemSize) {
+                    ThrowCorrupted();
+                }
+
+                return count;
+            };
+
+            Components_ = ReadVar32();
+            const ui32 reusedStringCount = readCount(1);
+            Strings_.reserve(reusedStringCount);
+            for (ui32 i = 0; i < reusedStringCount; ++i) {
+                ui32 length = ReadVar32();
+                auto internedBuf = Ctx_.AppendString(TStringBuf(ReadMany(length), length));
+                Strings_.push_back(internedBuf);
+            }
+
+            if (Components_ & TSerializedExprGraphComponents::Positions) {
+                const ui32 filesCount = readCount(1);
+                Files_.reserve(filesCount);
+                for (ui32 i = 0; i < filesCount; ++i) {
+                    ui32 length = ReadVar32();
+                    TStringBuf file(ReadMany(length), length);
+                    Files_.push_back(TString(file));
+                }
+
+                // A position is three packed numbers: row, column, file id.
+                const ui32 positionsCount = readCount(3);
+                Positions_.reserve(positionsCount);
+                for (ui32 i = 0; i < positionsCount; ++i) {
+                    ui32 row = ReadVar32();
+                    ui32 column = ReadVar32();
+                    ui32 fileNum = ReadVar32();
+                    // File ids are 1-based, 0 means "no file".
+                    if (fileNum > Files_.size()) {
+                        ThrowCorrupted();
+                    }
+
+                    Positions_.push_back({ row, column, fileNum });
+                }
+            }
+
+            TExprNode::TPtr result = Fetch();
+            if (Current_ != End_) {
+                ThrowCorrupted();
+            }
+
+            return result;
+        } catch (const yexception& e) {
+            TIssue issue(Pos_, TStringBuilder() << "Failed to deserialize expression graph, reason:\n" << e.what());
+            issue.SetCode(UNEXPECTED_ERROR, ESeverity::TSeverityIds_ESeverityId_S_FATAL);
+            Ctx_.AddError(issue);
+            return nullptr;
+        }
+    }
+
+private:
+    // Reads one node from the current buffer position, recursing into its children.
+    // A command byte without NODE_VALUE is a back-reference and yields the node
+    // that was built earlier. Otherwise a new node is created, appended to Nodes_
+    // after its children (the same order in which the writer assigns ids) and
+    // returned. Throws yexception on truncated or inconsistent data.
+    TExprNode::TPtr Fetch() {
+        char command = Read();
+        if (!(command & NODE_VALUE)) {
+            ui32 nodeId = ReadVar32();
+            // Ids are 1-based and may only refer to nodes that are already complete.
+            if (nodeId == 0 || nodeId > Nodes_.size()) {
+                ThrowCorrupted();
+            }
+
+            return Nodes_[nodeId - 1];
+        }
+
+        command &= ~NODE_VALUE;
+        TPosition pos = Pos_;
+        if (Components_ & TSerializedExprGraphComponents::Positions) {
+            if (command & SAME_POSITION) {
+                pos = LastPosition_;
+                command &= ~SAME_POSITION;
+            } else {
+                ui32 posNum = ReadVar32();
+                if (posNum >= Positions_.size()) {
+                    ThrowCorrupted();
+                }
+
+                const auto& posItem = Positions_[posNum];
+
+                pos = TPosition();
+                pos.Row = std::get<0>(posItem);
+                pos.Column = std::get<1>(posItem);
+                auto fileNum = std::get<2>(posItem);
+                if (fileNum > 0) {
+                    // The range of fileNum was validated when the table was loaded.
+                    pos.File = Files_[fileNum - 1];
+                }
+
+                LastPosition_ = pos;
+            }
+        }
+
+        ui32 atomFlags = 0;
+        bool hasInlineStr = command & INLINE_STR;
+        command &= ~INLINE_STR;
+        if (command & ATOM_FLAG) {
+            // For atoms the low bits are TNodeFlags rather than a node type.
+            atomFlags = command & TNodeFlags::FlagsMask;
+            command &= ~(ATOM_FLAG | TNodeFlags::FlagsMask);
+            command |= TExprNode::Atom;
+        }
+
+        const bool wide = command & WIDE;
+        command &= ~WIDE;
+
+        TStringBuf content;
+        if (command == TExprNode::Atom || command == TExprNode::Callable || command == TExprNode::Argument) {
+            if (hasInlineStr) {
+                ui32 length = ReadVar32();
+                content = TStringBuf(ReadMany(length), length);
+            } else {
+                ui32 strNum = ReadVar32();
+                if (strNum >= Strings_.size()) {
+                    ThrowCorrupted();
+                }
+
+                content = Strings_[strNum];
+            }
+        }
+
+        ui32 childrenSize = 0;
+        if (command == TExprNode::Callable || command == TExprNode::Arguments || command == TExprNode::List || (command == TExprNode::Lambda && wide)) {
+            childrenSize = ReadVar32();
+            // Every child occupies at least its command byte, so a count larger than
+            // the remaining input is corrupted; fail before reserving memory for it.
+            if (childrenSize > static_cast<size_t>(End_ - Current_)) {
+                ThrowCorrupted();
+            }
+        }
+
+        // Reads `childrenSize` consecutive child nodes.
+        const auto fetchChildren = [this, childrenSize]() {
+            TExprNode::TListType children;
+            children.reserve(childrenSize);
+            for (ui32 i = 0U; i < childrenSize; ++i) {
+                children.emplace_back(Fetch());
+            }
+
+            return children;
+        };
+
+        TExprNode::TPtr ret;
+        switch (command) {
+        case TExprNode::Atom:
+            ret = Ctx_.NewAtom(pos, content, atomFlags);
+            break;
+
+        case TExprNode::List: {
+            auto children = fetchChildren();
+            ret = Ctx_.NewList(pos, std::move(children));
+            break;
+        }
+
+        case TExprNode::Callable: {
+            auto children = fetchChildren();
+            ret = Ctx_.NewCallable(pos, content, std::move(children));
+            break;
+        }
+
+        case TExprNode::Argument:
+            ret = Ctx_.NewArgument(pos, content);
+            break;
+
+        case TExprNode::Arguments: {
+            auto children = fetchChildren();
+            ret = Ctx_.NewArguments(pos, std::move(children));
+            break;
+        }
+
+        case TExprNode::Lambda: {
+            if (wide) {
+                auto children = fetchChildren();
+                ret = Ctx_.NewLambda(pos, std::move(children));
+            } else {
+                // A plain lambda is always exactly (arguments, body); no count is stored.
+                auto args = Fetch();
+                auto body = Fetch();
+                ret = Ctx_.NewLambda(pos, {std::move(args), std::move(body)});
+            }
+            break;
+        }
+
+        case TExprNode::World:
+            ret = Ctx_.NewWorld(pos);
+            break;
+
+        default:
+            ThrowCorrupted();
+        }
+
+        Nodes_.push_back(ret);
+        return ret;
+    }
+
+    Y_FORCE_INLINE char Read() {
+        if (Current_ == End_)
+            ThrowNoData();
+
+        return *Current_++;
+    }
+
+    // Returns a pointer to the next `count` bytes of the buffer and advances past
+    // them. Throws (via ThrowNoData) when fewer than `count` bytes remain.
+    Y_FORCE_INLINE const char* ReadMany(ui32 count) {
+        // Compare against the remaining size instead of forming `Current_ + count`:
+        // for a corrupted length that pointer would lie outside the buffer, which is
+        // undefined behaviour and may wrap around so that the check passes.
+        if (count > static_cast<size_t>(End_ - Current_))
+            ThrowNoData();
+
+        const char* result = Current_;
+        Current_ += count;
+        return result;
+    }
+
+    Y_FORCE_INLINE ui32 ReadVar32() {
+        ui32 result = 0;
+        size_t count = Unpack32(Current_, End_ - Current_, result);
+        if (!count) {
+            ThrowCorrupted();
+        }
+        Current_ += count;
+        return result;
+    }
+
+    [[noreturn]] static void ThrowNoData() {
+        ythrow yexception() << "No more data in buffer";
+    }
+
+    [[noreturn]] static void ThrowCorrupted() {
+        ythrow yexception() << "Serialized data is corrupted";
+    }
+
+private:
+    const TPosition Pos_;
+    const char* Current_;
+    const char* const End_;
+    TExprContext& Ctx_;
+    ui16 Components_;
+
+    TVector<TStringBuf> Strings_;
+    TVector<TString> Files_;
+    TVector<std::tuple<ui32, ui32, ui32>> Positions_;
+
+    TPosition LastPosition_;
+    TDeque<TExprNode::TPtr> Nodes_;
+};
+
+}
+
+// Serializes the expression graph rooted at `node` into a binary string.
+// `components` is a TSerializedExprGraphComponents mask; it is written at the
+// start of the output and controls whether node positions are stored.
+TString SerializeGraph(const TExprNode& node, TExprContext& ctx, ui16 components) {
+    TWriter writer(ctx, components);
+    // Three phases: collect strings/files/positions, write the tables, write the nodes.
+    writer.Prepare(node);
+    writer.Init();
+    writer.Save(node);
+    return writer.Out();
+}
+
+// Convenience overload: resolves the position handle through `ctx` and forwards
+// to the TPosition overload.
+TExprNode::TPtr DeserializeGraph(TPositionHandle pos, TStringBuf buffer, TExprContext& ctx) {
+    return DeserializeGraph(ctx.GetPosition(pos), buffer, ctx);
+}
+
+// Rebuilds an expression graph from `buffer`, creating the nodes in `ctx`.
+// `pos` is the position given to nodes when the buffer stores no positions, and
+// the position of the issue reported on failure. Returns nullptr when the reader
+// fails with a yexception; the issue is then added to `ctx`.
+TExprNode::TPtr DeserializeGraph(TPosition pos, TStringBuf buffer, TExprContext& ctx) {
+    TReader reader(pos, buffer, ctx);
+    return reader.Load();
+}
+
+} // namespace NYql
+

+ 19 - 0
yql/essentials/ast/serialize/yql_expr_serialize.h

@@ -0,0 +1,19 @@
+#pragma once
+
+#include <yql/essentials/ast/yql_expr.h>
+
+namespace NYql {
+
+struct TSerializedExprGraphComponents {
+    enum : ui16 {
+        Graph = 0x00,
+        Positions = 0x01
+    };
+};
+
+TString SerializeGraph(const TExprNode& node, TExprContext& ctx, ui16 components = TSerializedExprGraphComponents::Graph);
+TExprNode::TPtr DeserializeGraph(TPositionHandle pos, TStringBuf buffer, TExprContext& ctx);
+TExprNode::TPtr DeserializeGraph(TPosition pos, TStringBuf buffer, TExprContext& ctx);
+
+} // namespace NYql
+

+ 18 - 0
yql/essentials/ast/ut/ya.make

@@ -0,0 +1,18 @@
+UNITTEST_FOR(yql/essentials/ast)
+
+FORK_SUBTESTS()
+
+SRCS(
+    yql_ast_ut.cpp
+    yql_expr_check_args_ut.cpp
+    yql_expr_builder_ut.cpp
+    yql_expr_ut.cpp
+    yql_type_string_ut.cpp
+    yql_constraint_ut.cpp
+)
+
+PEERDIR(
+    library/cpp/yson/node
+)
+
+END()

+ 48 - 0
yql/essentials/ast/ya.make

@@ -0,0 +1,48 @@
+LIBRARY()
+
+SRCS(
+    yql_ast.cpp
+    yql_ast.h
+    yql_constraint.cpp
+    yql_constraint.h
+    yql_ast_annotation.cpp
+    yql_ast_annotation.h
+    yql_ast_escaping.cpp
+    yql_ast_escaping.h
+    yql_errors.cpp
+    yql_errors.h
+    yql_expr.cpp
+    yql_expr.h
+    yql_expr_builder.cpp
+    yql_expr_builder.h
+    yql_expr_types.cpp
+    yql_expr_types.h
+    yql_gc_nodes.cpp
+    yql_gc_nodes.h
+    yql_type_string.cpp
+    yql_type_string.h
+)
+
+PEERDIR(
+    contrib/libs/openssl
+    library/cpp/colorizer
+    library/cpp/containers/sorted_vector
+    library/cpp/containers/stack_vector
+    library/cpp/deprecated/enum_codegen
+    library/cpp/enumbitset
+    library/cpp/string_utils/levenshtein_diff
+    library/cpp/yson
+    library/cpp/yson/node
+    yql/essentials/public/udf
+    yql/essentials/utils
+    yql/essentials/utils/fetch
+    yql/essentials/core/issue
+    yql/essentials/core/url_lister/interface
+    yql/essentials/parser/pg_catalog
+)
+
+END()
+
+RECURSE_FOR_TESTS(
+    ut
+)

+ 665 - 0
yql/essentials/ast/yql_ast.cpp

@@ -0,0 +1,665 @@
+#include "yql_ast.h"
+#include "yql_ast_escaping.h"
+
+#include <util/string/builder.h>
+#include <util/system/compiler.h>
+#include <library/cpp/containers/stack_vector/stack_vec.h>
+#include <yql/essentials/utils/utf8.h>
+
+#include <cstdlib>
+
+namespace NYql {
+
+namespace {
+
+    inline bool IsWhitespace(char c) {
+        return c == ' ' || c == '\n' || c == '\r' || c == '\t';
+    }
+
+    inline bool IsListStart(char c)      { return c == '(';  }
+    inline bool IsListEnd(char c)        { return c == ')';  }
+    inline bool IsCommentStart(char c)   { return c == '#';  }
+    inline bool IsQuoteStart(char c)     { return c == '\''; }
+    inline bool IsStringStart(char c)    { return c == '"';  }
+    inline bool IsHexStringStart(char c) { return c == 'x';  }
+    inline bool IsMultilineStringStart(char c) { return c == '@'; }
+
+    inline bool NeedEscaping(const TStringBuf& str) {
+        for (char ch: str) {
+            if (IsWhitespace(ch) || IsListStart(ch) ||
+                    IsListEnd(ch) || IsCommentStart(ch) ||
+                    IsQuoteStart(ch) || IsStringStart(ch) ||
+                    !isprint(ch & 0xff))
+            {
+                return true;
+            }
+        }
+
+        return str.empty();
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    // TAstParserContext
+    ///////////////////////////////////////////////////////////////////////////
+    class TAstParserContext {
+    public:
+        inline TAstParserContext(const TStringBuf& str, TMemoryPool* externalPool, const TString& file)
+            : Str_(str)
+            , Position_(1, 1, file)
+            , Offset_(0)
+            , Pool_(externalPool)
+        {
+            if (!Pool_) {
+                InnerPool_ = std::make_unique<TMemoryPool>(4096);
+                Pool_ = InnerPool_.get();
+            }
+        }
+
+        inline char Peek() const {
+            Y_ABORT_UNLESS(!AtEnd());
+            return Str_[Offset_];
+        }
+
+        inline bool AtEnd() const {
+            return Str_.size() == Offset_;
+        }
+
+        inline char Next() {
+            Y_ABORT_UNLESS(!AtEnd());
+            char ch = Str_[Offset_];
+            if (ch == '\n') {
+                ++Position_.Row;
+                Position_.Column = 1;
+            } else {
+                ++Position_.Column;
+            }
+
+            ++Offset_;
+            return ch;
+        }
+
+        // Consumes characters up to and including the first stopChar, i.e. stops
+        // right after it; if stopChar never occurs, consumes the rest of the input.
+        inline void SeekTo(char stopChar) {
+            while (!AtEnd() && Next() != stopChar) {
+                // empty loop
+            }
+        }
+
+        inline TStringBuf GetToken(ui32 begin, ui32 end) {
+            Y_ABORT_UNLESS(end >= begin);
+            return Str_.SubString(begin, end - begin);
+        }
+
+        inline bool IsAtomEnded() {
+            if (AtEnd()) {
+                return true;
+            }
+            char c = Peek();
+            return IsWhitespace(c) || IsListStart(c) || IsListEnd(c);
+        }
+
+        inline const TStringBuf& Str() const { return Str_; }
+        inline ui32 Offset() const { return Offset_; }
+        inline const TPosition& Position() const { return Position_; }
+        inline TMemoryPool& Pool() { return *Pool_; }
+        inline std::unique_ptr<TMemoryPool>&& InnerPool() { return std::move(InnerPool_); }
+
+    private:
+        TStringBuf Str_;
+        TPosition Position_;
+        ui32 Offset_;
+        TMemoryPool* Pool_;
+        std::unique_ptr<TMemoryPool> InnerPool_;
+    };
+
+    ///////////////////////////////////////////////////////////////////////////
+    // TAstParser
+    ///////////////////////////////////////////////////////////////////////////
+    class TAstParser {
+    public:
+        TAstParser(const TStringBuf& str, TMemoryPool* externalPool, const TString& file)
+            : Ctx_(str, externalPool, file)
+        {
+        }
+
+        TAstParseResult Parse() {
+            TAstNode* root = nullptr;
+            if (!IsUtf8(Ctx_.Str())) {
+               AddError("Invalid UTF8 input");
+            } else {
+               root = ParseList(0U);
+
+               SkipSpace();
+               if (!Ctx_.AtEnd()) {
+                  AddError("Unexpected symbols after end of root list");
+               }
+            }
+
+            TAstParseResult result;
+            if (!Issues_.Empty()) {
+                result.Issues = std::move(Issues_);
+            } else {
+                result.Root = root;
+                result.Pool = Ctx_.InnerPool();
+            }
+            return result;
+        }
+
+    private:
+        inline void AddError(const TString& message) {
+            Issues_.AddIssue(Ctx_.Position(), message);
+        }
+
+        inline void SkipComment() {
+            Ctx_.SeekTo('\n');
+        }
+
+        void SkipSpace() {
+            while (!Ctx_.AtEnd()) {
+                char c = Ctx_.Peek();
+                if (IsWhitespace(c)) {
+                    Ctx_.Next();
+                    continue;
+                }
+
+                if (IsCommentStart(c)) {
+                    SkipComment();
+                    continue;
+                }
+
+                break;
+            }
+        }
+
+        TAstNode* ParseList(size_t level) {
+            if (level >= 1000U) {
+                AddError("Too deep graph!");
+                return nullptr;
+            }
+
+            SkipSpace();
+
+            if (Ctx_.AtEnd()) {
+                AddError("Unexpected end");
+                return nullptr;
+            }
+
+            if (!IsListStart(Ctx_.Peek())) {
+                AddError("Expected (");
+                return nullptr;
+            }
+
+            Ctx_.Next();
+
+            TSmallVec<TAstNode*> children;
+            auto listPos = Ctx_.Position();
+            while (true) {
+                SkipSpace();
+
+                if (Ctx_.AtEnd()) {
+                    AddError("Expected )");
+                    return nullptr;
+                }
+
+                if (IsListEnd(Ctx_.Peek())) {
+                    Ctx_.Next();
+                    return TAstNode::NewList(listPos, children.data(), children.size(), Ctx_.Pool());
+                }
+
+                TAstNode* elem = ParseElement(level);
+                if (!elem)
+                    return nullptr;
+
+                children.push_back(elem);
+            }
+        }
+
+        // Parses one list element: a quoted element ('atom or '(list)), a nested
+        // list, or an atom. `level` is the current nesting depth and is increased
+        // for nested lists. Returns nullptr after recording an issue on malformed input.
+        TAstNode* ParseElement(size_t level) {
+            if (Ctx_.AtEnd()) {
+                AddError("Expected element");
+                return nullptr;
+            }
+
+            char c = Ctx_.Peek();
+            if (IsQuoteStart(c)) {
+                auto resPosition = Ctx_.Position();
+                Ctx_.Next();
+
+                // The end of input must be tested before peeking: Peek() aborts the
+                // process at the end of the buffer, e.g. when the text ends right
+                // after a quote character.
+                if (Ctx_.AtEnd()) {
+                    AddError("Expected quotation");
+                    return nullptr;
+                }
+
+                char ch = Ctx_.Peek();
+                if (IsWhitespace(ch) || IsCommentStart(ch) || IsListEnd(ch)) {
+                    AddError("Expected quotation");
+                    return nullptr;
+                }
+
+                TAstNode* content = IsListStart(ch)
+                        ? ParseList(++level)
+                        : ParseAtom();
+                if (!content)
+                    return nullptr;
+
+                // A quotation is represented as the two-element list (quote content).
+                return TAstNode::NewList(resPosition, Ctx_.Pool(), &TAstNode::QuoteAtom, content);
+            }
+
+            if (IsListStart(c))
+                return ParseList(++level);
+
+            return ParseAtom();
+        }
+
+        TAstNode* ParseAtom() {
+            if (Ctx_.AtEnd()) {
+                AddError("Expected atom");
+                return nullptr;
+            }
+
+            auto resPosition = Ctx_.Position();
+            ui32 atomStart = Ctx_.Offset();
+
+            while (true) {
+                char c = Ctx_.Peek();
+                // special symbols = 0x20, 0x0a, 0x0d, 0x09 space
+                // 0x22, 0x23, 0x28, 0x29  "#()
+                // 0x27 '
+                // special symbols = 0x40, 0x78 @x
+                // &0x3f = 0x00,0x38
+#define MASK(x) (1ull << ui64(x))
+                const ui64 mask1 = MASK(0x20) | MASK(0x0a) | MASK(0x0d)
+                    | MASK(0x09) | MASK(0x22) | MASK(0x23) | MASK(0x28) | MASK(0x29) | MASK(0x27);
+                const ui64 mask2 = MASK(0x00) | MASK(0x38);
+#undef MASK
+                if (!(c & 0x80) && ((1ull << (c & 0x3f)) & (c <= 0x3f ? mask1 : mask2))) {
+                    if (IsWhitespace(c) || IsListStart(c) || IsListEnd(c))
+                        break;
+
+                    if (IsCommentStart(c)) {
+                        AddError("Unexpected comment");
+                        return nullptr;
+                    }
+
+                    if (IsQuoteStart(c)) {
+                        AddError("Unexpected quotation");
+                        return nullptr;
+                    }
+
+                    // multiline starts with '@@'
+                    if (IsMultilineStringStart(c)) {
+                        Ctx_.Next();
+                        if (Ctx_.AtEnd()) break;
+
+                        if (!IsMultilineStringStart(Ctx_.Peek())) {
+                            continue;
+                        }
+
+                        TString token;
+                        if (!TryParseMultilineToken(token)) {
+                            return nullptr;
+                        }
+
+                        if (!Ctx_.IsAtomEnded()) {
+                            AddError("Unexpected end of @@");
+                            return nullptr;
+                        }
+
+                        return TAstNode::NewAtom(resPosition, token, Ctx_.Pool(), TNodeFlags::MultilineContent);
+                    }
+                    // hex string starts with 'x"'
+                    else if (IsHexStringStart(c)) {
+                        Ctx_.Next(); // skip 'x'
+                        if (Ctx_.AtEnd()) break;
+
+                        if (!IsStringStart(Ctx_.Peek())) {
+                            continue;
+                        }
+
+                        Ctx_.Next(); // skip first '"'
+
+                        size_t readBytes = 0;
+                        TStringStream ss;
+                        TStringBuf atom = Ctx_.Str().SubStr(Ctx_.Offset());
+                        EUnescapeResult unescapeResult = UnescapeBinaryAtom(
+                                atom, '"', &ss, &readBytes);
+
+                        // advance position
+                        while (readBytes-- != 0) {
+                            Ctx_.Next();
+                        }
+
+                        if (unescapeResult != EUnescapeResult::OK) {
+                            AddError(TString(UnescapeResultToString(unescapeResult)));
+                            return nullptr;
+                        }
+
+                        Ctx_.Next(); // skip last '"'
+                        if (!Ctx_.IsAtomEnded()) {
+                            AddError("Unexpected end of \"");
+                            return nullptr;
+                        }
+
+                        return TAstNode::NewAtom(resPosition, ss.Str(), Ctx_.Pool(), TNodeFlags::BinaryContent);
+                    }
+                    else if (IsStringStart(c)) {
+                        if (Ctx_.Offset() != atomStart) {
+                            AddError("Unexpected \"");
+                            return nullptr;
+                        }
+
+                        Ctx_.Next(); // skip first '"'
+
+                        size_t readBytes = 0;
+                        TStringStream ss;
+                        TStringBuf atom = Ctx_.Str().SubStr(Ctx_.Offset());
+                        EUnescapeResult unescapeResult = UnescapeArbitraryAtom(
+                                atom, '"', &ss, &readBytes);
+
+                        // advance position
+                        while (readBytes-- != 0) {
+                            Ctx_.Next();
+                        }
+
+                        if (unescapeResult != EUnescapeResult::OK) {
+                            AddError(TString(UnescapeResultToString(unescapeResult)));
+                            return nullptr;
+                        }
+
+                        if (!Ctx_.IsAtomEnded()) {
+                            AddError("Unexpected end of \"");
+                            return nullptr;
+                        }
+
+                        return TAstNode::NewAtom(resPosition, ss.Str(), Ctx_.Pool(), TNodeFlags::ArbitraryContent);
+                    }
+                }
+
+                Ctx_.Next();
+                if (Ctx_.AtEnd()) {
+                    break;
+                }
+            }
+
+            return TAstNode::NewAtom(resPosition, Ctx_.GetToken(atomStart, Ctx_.Offset()), Ctx_.Pool());
+        }
+
+        // Reads the body of a multiline atom and appends it to `token`.
+        // Entered with the cursor on the second '@' of the opening marker.
+        // Every group of four consecutive '@' contributes only two '@' to
+        // `token`; the atom ends at a run that leaves two or more '@' pending.
+        // Returns false, after recording an error, if the input ends first.
+        bool TryParseMultilineToken(TString& token) {
+            Ctx_.Next(); // skip second '@'
+
+            // Offset of the first character not yet appended to `token`.
+            ui32 start = Ctx_.Offset();
+            while (true) {
+                Ctx_.SeekTo('@');
+
+                if (Ctx_.AtEnd()) {
+                    AddError("Unexpected multiline atom end");
+                    return false;
+                }
+
+                // NOTE(review): the initial value of 1 depends on where SeekTo()
+                // leaves the cursor relative to the '@' it found - confirm
+                // against TAstParserContext.
+                ui32 count = 1; // we seek to first '@'
+                while (!Ctx_.AtEnd() && Ctx_.Peek() == '@') {
+                    count++;
+                    Ctx_.Next();
+                    if (count == 4) {
+                        // Reduce each four '@' to two
+                        // (the appended segment stops two characters before the cursor)
+                        token.append(Ctx_.GetToken(start, Ctx_.Offset() - 2));
+                        start = Ctx_.Offset();
+                        count = 0;
+                    }
+                }
+                if (count >= 2) {
+                    break;
+                }
+            }
+
+            // two '@@' at the end
+            // (they are consumed but excluded from the token)
+            token.append(Ctx_.GetToken(start, Ctx_.Offset() - 2));
+            return true;
+        }
+
+    private:
+        TAstParserContext Ctx_;
+        TIssues Issues_;
+    };
+
+    ///////////////////////////////////////////////////////////////////////////
+    // ast node printing functions
+    ///////////////////////////////////////////////////////////////////////////
+
+    // True when `node` (which must be a list) has exactly two children and
+    // the first one is an atom whose content is "quote".
+    inline bool IsQuoteNode(const TAstNode& node) {
+        if (node.GetChildrenCount() != 2) {
+            return false;
+        }
+
+        const TAstNode* head = node.GetChild(0);
+        if (head->GetType() != TAstNode::Atom) {
+            return false;
+        }
+
+        return head->GetContent() == TStringBuf("quote");
+    }
+
+    // True when `node` (which must be a list) has exactly two children and
+    // the first one is an atom whose content is "block".
+    inline bool IsBlockNode(const TAstNode& node) {
+        if (node.GetChildrenCount() != 2) {
+            return false;
+        }
+
+        const TAstNode* head = node.GetChild(0);
+        if (head->GetType() != TAstNode::Atom) {
+            return false;
+        }
+
+        return head->GetContent() == TStringBuf("block");
+    }
+
+    // Writes `indentation` space characters to `out`.
+    // The spaces come from a constant buffer and are emitted in fixed-size
+    // chunks, so stack usage does not depend on the requested width (the
+    // previous implementation allocated `indentation` bytes with alloca()).
+    Y_NO_INLINE void Indent(IOutputStream& out, ui32 indentation) {
+        static const char Spaces[] = "                                                                ";
+        constexpr ui32 ChunkSize = sizeof(Spaces) - 1; // exclude the terminating NUL
+
+        while (indentation > 0) {
+            const ui32 portion = indentation < ChunkSize ? indentation : ChunkSize;
+            out.Write(Spaces, portion);
+            indentation -= portion;
+        }
+    }
+
+    // Writes `str` as a multiline atom: "@@", the content, "@@".
+    // Inside the content every run of '@' with an even length is written with
+    // each '@' doubled; runs with an odd length are written unchanged.
+    // NOTE(review): odd-length runs are not escaped here - confirm that the
+    // parser reads such output back to the same content.
+    void MultilineAtomPrint(IOutputStream& out, const TStringBuf& str) {
+        out << TStringBuf("@@");
+        size_t idx = str.find('@');
+        if (idx == TString::npos) {
+            // No '@' at all: the content needs no escaping.
+            out << str;
+        } else {
+            // `begin` marks the start of the text that has not been written yet.
+            const char* begin = str.data();
+            do {
+                // Measure the run of '@' starting at idx; idx ends up just past it.
+                ui32 count = 0;
+                for (; idx < str.length() && str[idx] == '@'; ++idx) {
+                    ++count;
+                }
+
+                if (count % 2 == 0) {
+                    // Flush the pending text that precedes the run, then skip the run itself.
+                    out.Write(begin, idx - (begin - str.data()) - count);
+                    begin = str.data() + idx;
+
+                    // Emit two '@' for every '@' of the run.
+                    while (count--) {
+                        out.Write(TStringBuf("@@"));
+                    }
+                }
+                idx = str.find('@', idx);
+            } while (idx != TString::npos);
+            // Flush the remaining text (this includes any odd-length runs left pending).
+            out.Write(begin, str.end() - begin);
+        }
+        out << TStringBuf("@@");
+    }
+
+    // Writes the compact, single-line s-expression form of `node` to `out`.
+    // Atoms are escaped according to their content flags (an unflagged empty
+    // atom is written as an escaped empty string). A two-element list headed
+    // by the "quote" atom is written as 'x; any other list is written as its
+    // children separated by single spaces inside parentheses.
+    void PrintNode(IOutputStream& out, const TAstNode& node) {
+        if (node.GetType() == TAstNode::Atom) {
+            const ui32 atomFlags = node.GetFlags();
+            const TStringBuf content = node.GetContent();
+
+            if (atomFlags & TNodeFlags::ArbitraryContent) {
+                EscapeArbitraryAtom(content, '"', &out);
+            } else if (atomFlags & TNodeFlags::BinaryContent) {
+                EscapeBinaryAtom(content, '"', &out);
+            } else if (atomFlags & TNodeFlags::MultilineContent) {
+                MultilineAtomPrint(out, content);
+            } else if (content.empty()) {
+                EscapeArbitraryAtom(content, '"', &out);
+            } else {
+                out << content;
+            }
+            return;
+        }
+
+        if (node.GetType() != TAstNode::List) {
+            return;
+        }
+
+        const ui32 childCount = node.GetChildrenCount();
+        if (childCount == 0) {
+            out << TStringBuf("()");
+            return;
+        }
+
+        if (IsQuoteNode(node)) {
+            out << '\'';
+            PrintNode(out, *node.GetChild(1));
+            return;
+        }
+
+        out << '(';
+        for (ui32 i = 0; i < childCount; ++i) {
+            if (i != 0) {
+                out << ' ';
+            }
+            PrintNode(out, *node.GetChild(i));
+        }
+        out << ')';
+    }
+
+    // Recursively writes `node` to `out` in the multi-line form selected by
+    // `flags` (a combination of TAstPrintFlags bits).
+    //   indent      - nesting depth; used for indentation (two spaces per level)
+    //                 when PerLine is not set.
+    //   blockLevel  - number of enclosing "block" lists; used for indentation
+    //                 when PerLine is set.
+    //   localIndent - depth counter that is incremented for each nested child
+    //                 and restarted at -1 for the children of a "block" list.
+    void PrettyPrintNode(
+            IOutputStream& out, const TAstNode& node,
+            i32 indent, i32 blockLevel, i32 localIndent, ui32 flags)
+    {
+        // Leading indentation: per nesting level without PerLine, otherwise
+        // only for nodes at local depth 1.
+        if (!(flags & TAstPrintFlags::PerLine)) {
+            Indent(out, indent * 2);
+        } else if (localIndent == 1) {
+            Indent(out, blockLevel * 2);
+        }
+
+        bool isQuote = false;
+        if (node.GetType() == TAstNode::Atom) {
+            if (node.GetFlags() & TNodeFlags::ArbitraryContent) {
+                // With AdaptArbitraryContent the quotes are dropped when the
+                // content does not need escaping.
+                if ((flags & TAstPrintFlags::AdaptArbitraryContent) &&
+                        !NeedEscaping(node.GetContent()))
+                {
+                    out << node.GetContent();
+                } else {
+                    EscapeArbitraryAtom(node.GetContent(), '"', &out);
+                }
+            } else if (node.GetFlags() & TNodeFlags::BinaryContent) {
+                EscapeBinaryAtom(node.GetContent(), '"', &out);
+            } else if (node.GetFlags() & TNodeFlags::MultilineContent) {
+                MultilineAtomPrint(out, node.GetContent());
+            } else {
+                // Unflagged atom: escaped when empty, or when
+                // AdaptArbitraryContent is set and the content needs escaping.
+                if (((flags & TAstPrintFlags::AdaptArbitraryContent) && NeedEscaping(node.GetContent())) ||
+                    node.GetContent().empty())
+                {
+                    EscapeArbitraryAtom(node.GetContent(), '"', &out);
+                } else {
+                    out << node.GetContent();
+                }
+            }
+        } else if (node.GetType() == TAstNode::List) {
+            isQuote = IsQuoteNode(node);
+            if (isQuote && (flags & TAstPrintFlags::ShortQuote)) {
+                // Short form of (quote x): an apostrophe followed by x.
+                out << '\'';
+                if (localIndent == 0 || !(flags & TAstPrintFlags::PerLine)) {
+                    out << Endl;
+                }
+
+                PrettyPrintNode(out, *node.GetChild(1), indent + 1, blockLevel, localIndent + 1, flags);
+            } else {
+                out << '(';
+                if (localIndent == 0 || !(flags & TAstPrintFlags::PerLine)) {
+                    out << Endl;
+                }
+
+                bool isBlock = IsBlockNode(node);
+                for (ui32 index = 0; index < node.GetChildrenCount(); ++index) {
+                    // In PerLine mode nested children share a line, separated by spaces.
+                    if (localIndent > 0 && (index > 0) && (flags & TAstPrintFlags::PerLine)) {
+                        out << ' ';
+                    }
+
+                    if (isBlock && (index > 0)) {
+                        // Body of a block: one block level deeper, local depth restarted.
+                        PrettyPrintNode(out, *node.GetChild(index), indent + 1, blockLevel + 1, -1, flags);
+                    } else {
+                        PrettyPrintNode(out, *node.GetChild(index), indent + 1, blockLevel, localIndent + 1, flags);
+                    }
+                }
+            }
+
+            // Closing parenthesis; not needed for the short quote form.
+            if (!isQuote || !(flags & TAstPrintFlags::ShortQuote)) {
+                if (!(flags & TAstPrintFlags::PerLine)) {
+                    Indent(out, indent * 2);
+                }
+
+                if (localIndent == 0 && blockLevel > 0) {
+                    Indent(out, (blockLevel - 1) * 2);
+                }
+
+                out << ')';
+            }
+        }
+
+        // Trailing line break; the short quote form leaves it to the quoted child.
+        if (!isQuote || !(flags & TAstPrintFlags::ShortQuote)) {
+            if (localIndent > 0 || blockLevel == 0) {
+                if (localIndent <= 1 || !(flags & TAstPrintFlags::PerLine)) {
+                    out << Endl;
+                }
+            }
+        }
+    }
+
+    // Calls Destroy() on every node of the tree rooted at `node`, children
+    // first. The shared static TAstNode::QuoteAtom is never destroyed.
+    void DestroyNode(TAstNode* node) {
+        if (node->IsList()) {
+            const ui32 childCount = node->GetChildrenCount();
+            for (ui32 childIndex = 0; childIndex != childCount; ++childIndex) {
+                DestroyNode(node->GetChild(childIndex));
+            }
+        }
+
+        if (node == &TAstNode::QuoteAtom) {
+            return;
+        }
+
+        node->Destroy();
+    }
+} // namespace
+
+// Destroys the nodes of the owned tree (if any) via Destroy().
+TAstParseResult::~TAstParseResult() {
+    Destroy();
+}
+
+// Move constructor: takes over the pool, the root node, the issues and the
+// auto-parameter values of `other`; `other.Root` is left null so that the
+// tree is destroyed only once.
+TAstParseResult::TAstParseResult(TAstParseResult&& other)
+    : Pool(std::move(other.Pool))
+    , Root(std::exchange(other.Root, nullptr))
+    , Issues(std::move(other.Issues))
+    , PgAutoParamValues(std::move(other.PgAutoParamValues))
+    , ActualSyntaxType(other.ActualSyntaxType)
+{
+}
+
+// Move assignment: destroys the currently owned tree, then takes over the
+// pool, the root node, the issues and the auto-parameter values of `other`,
+// leaving `other.Root` null. Assigning an object to itself is a no-op.
+TAstParseResult& TAstParseResult::operator=(TAstParseResult&& other) {
+    if (this == &other) {
+        // Without this guard Destroy() below would tear down our own tree
+        // and the object would end up with a null Root.
+        return *this;
+    }
+
+    // The old tree must be destroyed before its pool is replaced.
+    Destroy();
+    Pool = std::move(other.Pool);
+    Root = other.Root;
+    other.Root = nullptr;
+    Issues = std::move(other.Issues);
+    PgAutoParamValues = std::move(other.PgAutoParamValues);
+    ActualSyntaxType = other.ActualSyntaxType;
+    return *this;
+}
+
+// Destroys the nodes of the owned tree and resets Root to null.
+// Does nothing when there is no tree.
+void TAstParseResult::Destroy() {
+    if (!Root) {
+        return;
+    }
+
+    DestroyNode(Root);
+    Root = nullptr;
+}
+
+// Parses `str` with a TAstParser built from the given text, optional
+// external pool and file name, and returns the parser's result.
+TAstParseResult ParseAst(const TStringBuf& str, TMemoryPool* externalPool, const TString& file)
+{
+    return TAstParser(str, externalPool, file).Parse();
+}
+
+// Writes the compact single-line form of this node to `out` (see PrintNode).
+void TAstNode::PrintTo(IOutputStream& out) const {
+    PrintNode(out, *this);
+}
+
+// Writes the multi-line form of this node to `out`, starting at depth zero;
+// `flags` is a combination of TAstPrintFlags bits (see PrettyPrintNode).
+void TAstNode::PrettyPrintTo(IOutputStream& out, ui32 flags) const {
+    PrettyPrintNode(out, *this, 0, 0, 0, flags);
+}
+
+// Shared static "quote" atom used as the head of lists built by
+// TAstNode::Quote(); DestroyNode() skips it.
+TAstNode TAstNode::QuoteAtom(TPosition(0, 0), TStringBuf("quote"), TNodeFlags::Default);
+
+} // namespace NYql
+
+// Stream output for TAstNode::EType: writes the enumerator name for every
+// value listed in YQL_AST_NODE_TYPE_MAP, and the numeric value otherwise.
+template<>
+void Out<NYql::TAstNode::EType>(class IOutputStream &o, NYql::TAstNode::EType x) {
+// Expands one entry of the type map into a `case` that prints its name.
+#define YQL_AST_NODE_TYPE_MAP_TO_STRING_IMPL(name, ...) \
+    case ::NYql::TAstNode::name: \
+        o << #name; \
+        return;
+
+    switch (x) {
+        YQL_AST_NODE_TYPE_MAP(YQL_AST_NODE_TYPE_MAP_TO_STRING_IMPL)
+    default:
+        // Value outside the map: print it as an integer.
+        o << static_cast<int>(x);
+        return;
+    }
+}

+ 355 - 0
yql/essentials/ast/yql_ast.h

@@ -0,0 +1,355 @@
+#pragma once
+
+#include "yql_errors.h"
+
+#include <library/cpp/deprecated/enum_codegen/enum_codegen.h>
+#include <util/generic/ptr.h>
+#include <util/generic/strbuf.h>
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+#include <util/stream/output.h>
+#include <util/stream/str.h>
+#include <util/memory/pool.h>
+#include <util/generic/array_ref.h>
+
+namespace NYql {
+
+// Bit flags stored on atom nodes; the printers use them to choose how the
+// atom content is written.
+struct TNodeFlags {
+    enum : ui16 {
+        Default = 0,
+        ArbitraryContent = 0x01, // printed with EscapeArbitraryAtom
+        BinaryContent = 0x02,    // printed with EscapeBinaryAtom
+        MultilineContent = 0x04, // printed as a multiline (@@...@@) atom
+    };
+
+    static constexpr ui32 FlagsMask = 0x07; // all flags should fit here
+};
+
+// A node of the YQL s-expression AST: either an atom (string content plus
+// TNodeFlags bits) or a list of child node pointers. The constructors are
+// private; nodes are created through the New* factories, which
+// placement-construct them in memory taken from a TMemoryPool.
+struct TAstNode {
+// X-macro listing the node kinds together with their numeric values.
+#define YQL_AST_NODE_TYPE_MAP(xx) \
+    xx(List, 0) \
+    xx(Atom, 1) \
+
+    enum EType : ui32 {
+        YQL_AST_NODE_TYPE_MAP(ENUM_VALUE_GEN)
+    };
+
+
+    // Lists with at most this many children keep the child pointers inline
+    // (Data.S); longer lists reference an external array (Data.L).
+    static const ui32 SmallListCount = 2;
+
+    // Compact single-line output.
+    void PrintTo(IOutputStream& out) const;
+    // Multi-line output; prettyFlags is a combination of TAstPrintFlags bits.
+    void PrettyPrintTo(IOutputStream& out, ui32 prettyFlags) const;
+
+    // Returns the PrintTo() output as a string.
+    inline TString ToString() const {
+        TStringStream str;
+        PrintTo(str);
+        return str.Str();
+    }
+
+    // Returns the PrettyPrintTo() output as a string.
+    inline TString ToString(ui32 prettyFlags) const {
+        TStringStream str;
+        PrettyPrintTo(str, prettyFlags);
+        return str.Str();
+    }
+
+    inline EType GetType() const {
+        return Type;
+    }
+
+    inline bool IsAtom() const {
+        return Type == Atom;
+    }
+
+    inline bool IsList() const {
+        return Type == List;
+    }
+
+    // True for a list with exactly `len` children.
+    inline bool IsListOfSize(ui32 len) const {
+        return Type == List && ListCount == len;
+    }
+
+    inline TPosition GetPosition() const {
+        return Position;
+    }
+
+    inline void SetPosition(TPosition position) {
+        Position = position;
+    }
+
+    // Atom content as a non-owning view. Aborts when called on a list.
+    inline TStringBuf GetContent() const {
+        Y_ABORT_UNLESS(IsAtom());
+        return TStringBuf(Data.A.Content, Data.A.Size);
+    }
+
+    // Replaces the atom content with a copy of newContent stored in `pool`.
+    // Aborts when called on a list.
+    inline void SetContent(TStringBuf newContent, TMemoryPool& pool) {
+        Y_ABORT_UNLESS(IsAtom());
+        auto poolContent = pool.AppendString(newContent);
+        Data.A.Content = poolContent.data();
+        Data.A.Size = poolContent.size();
+    }
+
+    // Replaces the atom content WITHOUT copying: only the pointer and the
+    // size are stored, so the caller must keep the characters alive.
+    // Aborts when called on a list.
+    inline void SetLiteralContent(TStringBuf newContent) {
+        Y_ABORT_UNLESS(IsAtom());
+        Data.A.Content = newContent.data();
+        Data.A.Size = newContent.size();
+    }
+
+    // Atom flags (TNodeFlags bits). Aborts when called on a list.
+    inline ui32 GetFlags() const {
+        Y_ABORT_UNLESS(IsAtom());
+        return Data.A.Flags;
+    }
+
+    // Stores `flags` as given. Aborts when called on a list.
+    inline void SetFlags(ui32 flags) {
+        Y_ABORT_UNLESS(IsAtom());
+        Data.A.Flags = flags;
+    }
+
+    // Number of children. Aborts when called on an atom.
+    inline ui32 GetChildrenCount() const {
+        Y_ABORT_UNLESS(IsList());
+        return ListCount;
+    }
+
+    // Child at `index`. Aborts on an atom or when index is out of range.
+    inline const TAstNode* GetChild(ui32 index) const {
+        Y_ABORT_UNLESS(IsList());
+        Y_ABORT_UNLESS(index < ListCount);
+        if (ListCount <= SmallListCount) {
+            return Data.S.Children[index];
+        } else {
+            return Data.L.Children[index];
+        }
+    }
+
+    // Mutable overload of GetChild().
+    inline TAstNode* GetChild(ui32 index) {
+        Y_ABORT_UNLESS(IsList());
+        Y_ABORT_UNLESS(index < ListCount);
+        if (ListCount <= SmallListCount) {
+            return Data.S.Children[index];
+        } else {
+            return Data.L.Children[index];
+        }
+    }
+    
+    // All children as a non-owning array view. Aborts when called on an atom.
+    inline TArrayRef<TAstNode* const> GetChildren() const {
+        Y_ABORT_UNLESS(IsList());
+        return {ListCount <= SmallListCount ? Data.S.Children : Data.L.Children, ListCount};
+    }
+
+    // Creates an atom in `pool`; the content is copied into the pool as well.
+    static inline TAstNode* NewAtom(TPosition position, TStringBuf content, TMemoryPool& pool, ui32 flags = TNodeFlags::Default) {
+        auto poolContent = pool.AppendString(content);
+        auto ret = pool.Allocate<TAstNode>();
+        ::new(ret) TAstNode(position, poolContent, flags);
+        return ret;
+    }
+
+    // atom with non-owning content, useful for literal strings
+    static inline TAstNode* NewLiteralAtom(TPosition position, TStringBuf content, TMemoryPool& pool, ui32 flags = TNodeFlags::Default) {
+        auto ret = pool.Allocate<TAstNode>();
+        ::new(ret) TAstNode(position, content, flags);
+        return ret;
+    }
+
+    // Creates a list of `childrenCount` children in `pool`. For more than
+    // SmallListCount children the pointer array is copied into the pool;
+    // otherwise the constructor copies the pointers into the node itself.
+    // Aborts if any child pointer is null.
+    static inline TAstNode* NewList(TPosition position, TAstNode** children, ui32 childrenCount, TMemoryPool& pool) {
+        TAstNode** poolChildren = nullptr;
+        if (childrenCount) {
+            if (childrenCount > SmallListCount) {
+                poolChildren = pool.AllocateArray<TAstNode*>(childrenCount);
+                memcpy(poolChildren, children, sizeof(void*) * childrenCount);
+            } else {
+                poolChildren = children;
+            }
+
+            for (ui32 index = 0; index < childrenCount; ++index) {
+                Y_ABORT_UNLESS(poolChildren[index]);
+            }
+        }
+
+        auto ret = pool.Allocate<TAstNode>();
+        ::new(ret) TAstNode(position, poolChildren, childrenCount);
+        return ret;
+    }
+
+    // Creates a list from the children given as individual arguments.
+    template <typename... TNodes>
+    static inline TAstNode* NewList(TPosition position, TMemoryPool& pool, TNodes... nodes) {
+        TAstNode* children[] = { nodes... };
+        return NewList(position, children, sizeof...(nodes), pool);
+    }
+
+    // Creates an empty list.
+    static inline TAstNode* NewList(TPosition position, TMemoryPool& pool) {
+        return NewList(position, nullptr, 0, pool);
+    }
+
+    // Shared static "quote" atom used as the head of quote lists.
+    static TAstNode QuoteAtom;
+
+    // Creates the two-element list (quote node).
+    static inline TAstNode* Quote(TPosition position, TMemoryPool& pool, TAstNode* node) {
+        return NewList(position, pool, &QuoteAtom, node);
+    }
+
+    inline ~TAstNode() {}
+
+    // Releases the file-name string held by Position.
+    // NOTE(review): presumably needed because pool-allocated nodes are never
+    // destructed individually - confirm against TMemoryPool usage.
+    void Destroy() {
+        TString().swap(Position.File);
+    }
+
+private:
+    // Atom constructor: stores the content pointer and size as given (no copy).
+    inline TAstNode(TPosition position, TStringBuf content, ui32 flags)
+        : Position(position)
+        , Type(Atom)
+        , ListCount(0)
+    {
+        Data.A.Content = content.data();
+        Data.A.Size = content.size();
+        Data.A.Flags = flags;
+    }
+
+    // List constructor: small lists copy the pointers inline, larger lists
+    // keep the `children` array pointer itself.
+    inline TAstNode(TPosition position, TAstNode** children, ui32 childrenCount)
+        : Position(position)
+        , Type(List)
+        , ListCount(childrenCount)
+    {
+        if (childrenCount <= SmallListCount) {
+            for (ui32 index = 0; index < childrenCount; ++index) {
+                Data.S.Children[index] = children[index];
+            }
+        } else {
+            Data.L.Children = children;
+        }
+    }
+
+    TPosition Position;
+    const EType Type;
+    const ui32 ListCount; // number of children; 0 for atoms
+
+    // Payload of an atom.
+    struct TAtom {
+        const char* Content;
+        ui32 Size;
+        ui32 Flags;
+    };
+
+    // Payload of a list with more than SmallListCount children.
+    struct TListType {
+        TAstNode** Children;
+    };
+
+    // Payload of a list with at most SmallListCount children.
+    struct TSmallList {
+        TAstNode* Children[SmallListCount];
+    };
+
+    // The active member is selected by Type and ListCount.
+    union {
+        TAtom A;
+        TListType L;
+        TSmallList S;
+    } Data;
+};
+
+// Query syntax variants; TAstParseResult::ActualSyntaxType defaults to YQLv1.
+enum class ESyntaxType {
+    YQLv0,
+    YQLv1,
+    Pg,
+};
+
+class IAutoParamBuilder;
+class IAutoParamDataBuilder;
+
+// Interface for describing the type of an auto parameter.
+// NOTE(review): the intended call protocol is not visible in this file -
+// presumably Begin*/End* bracket a nested type and BeforeItem/AfterItem
+// bracket each element; confirm against the implementations.
+class IAutoParamTypeBuilder {
+public:
+    virtual ~IAutoParamTypeBuilder() = default;
+
+    virtual void Pg(const TString& name) = 0;
+
+    virtual void BeginList() = 0;
+
+    virtual void EndList() = 0;
+
+    virtual void BeginTuple() = 0;
+
+    virtual void EndTuple() = 0;
+
+    virtual void BeforeItem() = 0;
+
+    virtual void AfterItem() = 0;
+
+    // Ends the type description and returns the builder for the data.
+    virtual IAutoParamDataBuilder& FinishType() = 0;
+};
+
+// Interface for supplying the value of an auto parameter; mirrors the
+// method set of IAutoParamTypeBuilder.
+// NOTE(review): call protocol is not visible in this file - confirm against
+// the implementations.
+class IAutoParamDataBuilder {
+public:
+    virtual ~IAutoParamDataBuilder() = default;
+
+    // NOTE(review): an empty TMaybe presumably denotes a NULL value - confirm.
+    virtual void Pg(const TMaybe<TString>& value) = 0;
+
+    virtual void BeginList() = 0;
+
+    virtual void EndList() = 0;
+
+    virtual void BeginTuple() = 0;
+
+    virtual void EndTuple() = 0;
+
+    virtual void BeforeItem() = 0;
+
+    virtual void AfterItem() = 0;
+
+    // Ends the data description and returns the owning parameter builder.
+    virtual IAutoParamBuilder& FinishData() = 0;
+};
+
+// Reference-counted collection of named auto parameters.
+class IAutoParamBuilder : public TThrRefBase {
+public:
+    virtual ~IAutoParamBuilder() = default;
+
+    // NOTE(review): presumably the number of parameters added so far - confirm.
+    virtual ui32 Size() const = 0;
+
+    virtual bool Contains(const TString& name) const = 0;
+
+    // Starts a parameter called `name` and returns the builder for its type.
+    virtual IAutoParamTypeBuilder& Add(const TString& name) = 0;
+};
+
+using IAutoParamBuilderPtr = TIntrusivePtr<IAutoParamBuilder>;
+
+// Factory interface producing IAutoParamBuilder instances.
+class IAutoParamBuilderFactory {
+public:
+    virtual ~IAutoParamBuilderFactory() = default;
+
+    virtual IAutoParamBuilderPtr MakeBuilder() = 0;
+};
+
+// Result of parsing: the root of the AST together with the memory pool,
+// the collected issues and related metadata. Move-only; the destructor
+// destroys the nodes reachable from Root.
+struct TAstParseResult {
+    std::unique_ptr<TMemoryPool> Pool;
+    TAstNode* Root = nullptr; // null when there is no tree (see IsOk)
+    TIssues Issues;
+    IAutoParamBuilderPtr PgAutoParamValues;
+    ESyntaxType ActualSyntaxType = ESyntaxType::YQLv1;
+
+    // True when a root node is present.
+    inline bool IsOk() const {
+        return !!Root;
+    }
+
+    TAstParseResult() = default;
+    ~TAstParseResult();
+    TAstParseResult(const TAstParseResult&) = delete;
+    TAstParseResult& operator=(const TAstParseResult&) = delete;
+
+    // Moving transfers the tree; the source is left with a null Root.
+    TAstParseResult(TAstParseResult&&);
+    TAstParseResult& operator=(TAstParseResult&&);
+
+    // Destroys the nodes of the tree and resets Root to null.
+    void Destroy();
+};
+
+// Per-statement information reported by a parser.
+// NOTE(review): the meaning of the fields is inferred from their names only -
+// confirm against the code that fills this struct.
+struct TStmtParseInfo {
+    bool KeepInCache = true;
+    TMaybe<TString> CommandTagName = {};
+};
+
+// Bit flags accepted by TAstNode::PrettyPrintTo() and ToString(ui32).
+struct TAstPrintFlags {
+    enum {
+        Default = 0,
+        // Indent by block level and keep nested children on one line,
+        // instead of indenting by nesting depth with a line break per node.
+        PerLine = 0x01,
+        // Print (quote x) lists in the short form 'x.
+        ShortQuote = 0x02,
+        // Decide quoting of atoms by NeedEscaping() on their content rather
+        // than only by the ArbitraryContent flag.
+        AdaptArbitraryContent = 0x04,
+    };
+};
+
+// Parses the s-expression text `str`. `externalPool` and `file` are passed
+// on to the parser; by default no external pool and an empty file name are used.
+TAstParseResult ParseAst(const TStringBuf& str, TMemoryPool* externalPool = nullptr, const TString& file = {});
+
+} // namespace NYql
+
+// Stream output specialization for TAstNode::EType.
+template<>
+void Out<NYql::TAstNode::EType>(class IOutputStream &o, NYql::TAstNode::EType x);

+ 189 - 0
yql/essentials/ast/yql_ast_annotation.cpp

@@ -0,0 +1,189 @@
+#include "yql_ast_annotation.h"
+#include <util/string/printf.h>
+#include <util/string/split.h>
+#include <util/string/cast.h>
+#include <util/string/builder.h>
+#include <library/cpp/containers/stack_vector/stack_vec.h>
+
+namespace NYql {
+
+namespace {
+
+// Wraps `node` into a two-element list (position-atom, node). For a list
+// node the children are annotated recursively and placed into a new list;
+// an atom is reused as is.
+TAstNode* AnnotateNodePosition(TAstNode& node, TMemoryPool& pool) {
+    const auto wrapperPosition = node.GetPosition();
+    TAstNode* const positionAtom = PositionAsNode(node.GetPosition(), pool);
+
+    TAstNode* payload = &node;
+    if (node.IsList()) {
+        const ui32 childCount = node.GetChildrenCount();
+        TSmallVec<TAstNode*> annotatedChildren(childCount);
+        for (ui32 i = 0; i < childCount; ++i) {
+            annotatedChildren[i] = AnnotateNodePosition(*node.GetChild(i), pool);
+        }
+
+        payload = TAstNode::NewList(node.GetPosition(), annotatedChildren.data(), annotatedChildren.size(), pool);
+    }
+
+    return TAstNode::NewList(wrapperPosition, pool, positionAtom, payload);
+}
+
+// Strips the annotations from an annotated node: `node` must be a non-empty
+// list whose last child is the payload. A list payload is rebuilt from its
+// recursively stripped children; an atom payload is returned as is.
+// Returns nullptr when the structure is not as expected.
+TAstNode* RemoveNodeAnnotations(TAstNode& node, TMemoryPool& pool) {
+    if (!node.IsList() || node.GetChildrenCount() == 0) {
+        return nullptr;
+    }
+
+    TAstNode* const payload = node.GetChild(node.GetChildrenCount() - 1);
+    if (!payload->IsList()) {
+        return payload;
+    }
+
+    const ui32 payloadSize = payload->GetChildrenCount();
+    TSmallVec<TAstNode*> stripped(payloadSize);
+    for (ui32 i = 0; i < payloadSize; ++i) {
+        TAstNode* const child = RemoveNodeAnnotations(*payload->GetChild(i), pool);
+        if (!child) {
+            return nullptr;
+        }
+
+        stripped[i] = child;
+    }
+
+    return TAstNode::NewList(payload->GetPosition(), stripped.data(), stripped.size(), pool);
+}
+
+// Like RemoveNodeAnnotations(), but additionally records in `annotations`
+// the annotation nodes (all children except the last) of every processed
+// node, keyed by the resulting stripped node.
+// Returns nullptr when the structure is not as expected.
+TAstNode* ExtractNodeAnnotations(TAstNode& node, TAnnotationNodeMap& annotations, TMemoryPool& pool) {
+    if (!node.IsList() || node.GetChildrenCount() == 0) {
+        return nullptr;
+    }
+
+    const ui32 annotationCount = node.GetChildrenCount() - 1;
+    TAstNode* const payload = node.GetChild(annotationCount);
+    TAstNode* result = payload;
+    if (payload->IsList()) {
+        const ui32 payloadSize = payload->GetChildrenCount();
+        TSmallVec<TAstNode*> stripped(payloadSize);
+        for (ui32 i = 0; i < payloadSize; ++i) {
+            TAstNode* const child = ExtractNodeAnnotations(*payload->GetChild(i), annotations, pool);
+            if (!child) {
+                return nullptr;
+            }
+
+            stripped[i] = child;
+        }
+
+        result = TAstNode::NewList(payload->GetPosition(), stripped.data(), stripped.size(), pool);
+    }
+
+    auto& collected = annotations[result];
+    collected.resize(annotationCount);
+    for (ui32 i = 0; i < annotationCount; ++i) {
+        collected[i] = node.GetChild(i);
+    }
+
+    return result;
+}
+
+// Builds a copy of the annotated node `node` in which the payload (the last
+// child) carries the position parsed from the annotation atom at
+// `annotationIndex`. The annotation text has the form "row:col[:file]", as
+// produced by PositionAsNode(). List payloads are processed recursively.
+// Returns nullptr on any structural or parse error: `node` is not a list,
+// there is no annotation/payload pair, the annotation is not an atom, row or
+// column is not a number, or a nested node fails.
+TAstNode* ApplyNodePositionAnnotations(TAstNode& node, ui32 annotationIndex, TMemoryPool& pool) {
+    if (!node.IsList())
+        return nullptr;
+
+    // Both the annotation and a distinct trailing payload must exist. The
+    // check is written without `annotationIndex + 2` so that a huge index
+    // cannot wrap around.
+    const ui32 childrenCount = node.GetChildrenCount();
+    if (childrenCount < 2 || annotationIndex > childrenCount - 2)
+        return nullptr;
+
+    auto annotation = node.GetChild(annotationIndex);
+    // GetContent() aborts on lists, so reject a malformed annotation here.
+    if (!annotation->IsAtom())
+        return nullptr;
+
+    auto str = annotation->GetContent();
+    TStringBuf rowPart;
+    TStringBuf colPart;
+    TString filePart;
+    // NOTE(review): GetNext() may throw when a field is missing - confirm
+    // whether that should be turned into a nullptr result as well.
+    GetNext(str, ':', rowPart);
+    GetNext(str, ':', colPart);
+    filePart = str;
+
+    ui32 row = 0, col = 0;
+    if (!TryFromString(rowPart, row) || !TryFromString(colPart, col))
+        return nullptr;
+
+    // All children except the payload are carried over unchanged.
+    TSmallVec<TAstNode*> listChildren(childrenCount);
+    for (ui32 index = 0; index < childrenCount - 1; ++index) {
+        listChildren[index] = node.GetChild(index);
+    }
+
+    auto lastNode = node.GetChild(childrenCount - 1);
+    TAstNode* lastResNode;
+    if (lastNode->IsAtom()) {
+        lastResNode = TAstNode::NewAtom(TPosition(col, row, filePart), lastNode->GetContent(), pool, lastNode->GetFlags());
+    } else {
+        TSmallVec<TAstNode*> lastNodeChildren(lastNode->GetChildrenCount());
+        for (ui32 index = 0; index < lastNode->GetChildrenCount(); ++index) {
+            auto item = ApplyNodePositionAnnotations(*lastNode->GetChild(index), annotationIndex, pool);
+            // Propagate the failure: NewList() aborts on null children.
+            if (!item)
+                return nullptr;
+
+            lastNodeChildren[index] = item;
+        }
+
+        lastResNode = TAstNode::NewList(TPosition(col, row, filePart), lastNodeChildren.data(), lastNodeChildren.size(), pool);
+    }
+
+    listChildren[childrenCount - 1] = lastResNode;
+    return TAstNode::NewList(node.GetPosition(), listChildren.data(), listChildren.size(), pool);
+}
+
+// In-place variant of ApplyNodePositionAnnotations(): sets the position of
+// the payload (the last child of `node`) to the one parsed from the
+// annotation atom at `annotationIndex` ("row:col[:file]") and recurses into
+// the children of a list payload.
+// Returns false on any structural or parse error; payloads visited before
+// the error keep their updated positions.
+bool ApplyNodePositionAnnotationsInplace(TAstNode& node, ui32 annotationIndex) {
+    if (!node.IsList())
+        return false;
+
+    // Both the annotation and a distinct trailing payload must exist. The
+    // check is written without `annotationIndex + 2` so that a huge index
+    // cannot wrap around.
+    const ui32 childrenCount = node.GetChildrenCount();
+    if (childrenCount < 2 || annotationIndex > childrenCount - 2)
+        return false;
+
+    auto annotation = node.GetChild(annotationIndex);
+    // GetContent() aborts on lists, so reject a malformed annotation here.
+    if (!annotation->IsAtom())
+        return false;
+
+    TStringBuf str = annotation->GetContent();
+    TStringBuf rowPart;
+    TStringBuf colPart;
+    TString filePart;
+    // NOTE(review): GetNext() may throw when a field is missing - confirm
+    // whether that should be turned into a `false` result as well.
+    GetNext(str, ':', rowPart);
+    GetNext(str, ':', colPart);
+    filePart = str;
+    ui32 row = 0, col = 0;
+    if (!TryFromString(rowPart, row) || !TryFromString(colPart, col))
+        return false;
+
+    auto lastNode = node.GetChild(childrenCount - 1);
+    lastNode->SetPosition(TPosition(col, row, filePart));
+    if (lastNode->IsList()) {
+        for (ui32 index = 0; index < lastNode->GetChildrenCount(); ++index) {
+            if (!ApplyNodePositionAnnotationsInplace(*lastNode->GetChild(index), annotationIndex))
+                return false;
+        }
+    }
+
+    return true;
+}
+
+}
+
+// Returns a copy of the tree in which every node is wrapped into a
+// (position-atom, node) list; new nodes are allocated in `pool`.
+TAstNode* AnnotatePositions(TAstNode& root, TMemoryPool& pool) {
+    return AnnotateNodePosition(root, pool);
+}
+
+// Strips the annotations from an annotated tree, keeping only the last
+// child of every annotated node. Returns nullptr in case of error.
+TAstNode* RemoveAnnotations(TAstNode& root, TMemoryPool& pool) {
+    return RemoveNodeAnnotations(root, pool);
+}
+
+// Returns a copy of the annotated tree whose payload nodes carry the
+// positions parsed from the annotation at `annotationIndex`.
+// Returns nullptr in case of error.
+TAstNode* ApplyPositionAnnotations(TAstNode& root, ui32 annotationIndex, TMemoryPool& pool) {
+    return ApplyNodePositionAnnotations(root, annotationIndex, pool);
+}
+
+// Sets, in place, the positions of the payload nodes of the annotated tree
+// from the annotation at `annotationIndex`. Returns false in case of error.
+bool ApplyPositionAnnotationsInplace(TAstNode& root, ui32 annotationIndex) {
+    return ApplyNodePositionAnnotationsInplace(root, annotationIndex);
+}
+
+// Creates an atom, located at `position`, whose content is the textual form
+// of the position: "row:column", followed by ":file" when the file name is
+// not empty.
+TAstNode* PositionAsNode(TPosition position, TMemoryPool& pool) {
+    TStringBuilder text;
+    text << position.Row << ':' << position.Column;
+
+    const bool hasFile = !position.File.empty();
+    if (hasFile) {
+        text << ':' << position.File;
+    }
+
+    return TAstNode::NewAtom(position, text, pool);
+}
+
+// Strips the annotations from an annotated tree and records them in
+// `annotations`, keyed by the resulting node. Returns nullptr in case of error.
+TAstNode* ExtractAnnotations(TAstNode& root, TAnnotationNodeMap& annotations, TMemoryPool& pool) {
+    return ExtractNodeAnnotations(root, annotations, pool);
+}
+
+}

+ 22 - 0
yql/essentials/ast/yql_ast_annotation.h

@@ -0,0 +1,22 @@
+#pragma once
+#include "yql_ast.h"
+#include <util/generic/hash.h>
+
+namespace NYql {
+
+// creates an atom whose content is "row:column[:file]" for the given position
+TAstNode* PositionAsNode(TPosition position, TMemoryPool& pool);
+
+// wraps every node of the tree into a (position-atom, node) list
+TAstNode* AnnotatePositions(TAstNode& root, TMemoryPool& pool);
+// returns nullptr in case of error
+TAstNode* RemoveAnnotations(TAstNode& root, TMemoryPool& pool);
+// returns nullptr in case of error
+TAstNode* ApplyPositionAnnotations(TAstNode& root, ui32 annotationIndex, TMemoryPool& pool);
+// returns false in case of error
+bool ApplyPositionAnnotationsInplace(TAstNode& root, ui32 annotationIndex);
+
+// maps a stripped node to the annotation nodes that preceded it
+typedef THashMap<const TAstNode*, TVector<const TAstNode*>> TAnnotationNodeMap;
+
+// returns nullptr in case of error
+TAstNode* ExtractAnnotations(TAstNode& root, TAnnotationNodeMap& annotations, TMemoryPool& pool);
+
+}

+ 275 - 0
yql/essentials/ast/yql_ast_escaping.cpp

@@ -0,0 +1,275 @@
+#include "yql_ast_escaping.h"
+
+#include <util/charset/wide.h>
+#include <util/stream/output.h>
+#include <util/string/hex.h>
+
+
+namespace NYql {
+
+// Converts a nibble value (expected 0..15) into its upper-case hexadecimal
+// character: 0..9 -> '0'..'9', 10..15 -> 'A'..'F'.
+static char HexDigit(char c)
+{
+    if (c < 10) {
+        return '0' + c;
+    }
+    return 'A' + (c - 10);
+}
+
+// Writes a single byte to `out` in escaped form:
+//  - backslash, double quote and the common control characters use their
+//    two-character C-style escapes;
+//  - other bytes accepted by isprint() are written as-is;
+//  - everything else is written as \xHH with upper-case hex digits.
+static void EscapedPrintChar(ui8 c, IOutputStream* out)
+{
+    const char* shortEscape = nullptr;
+    switch (c) {
+        case '\\': shortEscape = "\\\\"; break;
+        case '"' : shortEscape = "\\\""; break;
+        case '\t': shortEscape = "\\t"; break;
+        case '\n': shortEscape = "\\n"; break;
+        case '\r': shortEscape = "\\r"; break;
+        case '\b': shortEscape = "\\b"; break;
+        case '\f': shortEscape = "\\f"; break;
+        case '\a': shortEscape = "\\a"; break;
+        case '\v': shortEscape = "\\v"; break;
+        default: break;
+    }
+
+    if (shortEscape != nullptr) {
+        out->Write(shortEscape, 2);
+        return;
+    }
+
+    if (isprint(c)) {
+        out->Write(static_cast<char>(c));
+        return;
+    }
+
+    // Non-printable byte: emit \x followed by the high and low nibble.
+    char hexEscape[4] = { "\\x" };
+    hexEscape[2] = HexDigit(c >> 4);
+    hexEscape[3] = HexDigit(c & 0x0f);
+    out->Write(hexEscape, 4);
+}
+
+// Writes one code point to `out` in escaped form. Values below 0x80 are
+// delegated to EscapedPrintChar; values below 0x10000 become \uXXXX and all
+// larger values become \UXXXXXXXX (upper-case hex digits, fixed width).
+static void EscapedPrintUnicode(wchar32 rune, IOutputStream* out)
+{
+    static const int MAX_ESCAPE_LEN = 10;
+
+    if (rune < 0x80) {
+        EscapedPrintChar(static_cast<ui8>(rune & 0xff), out);
+        return;
+    }
+
+    const bool fitsInFourDigits = rune < 0x10000;
+    const int digitCount = fitsInFourDigits ? 4 : 8;
+
+    char buf[MAX_ESCAPE_LEN];
+    int len = 0;
+    buf[len++] = '\\';
+    buf[len++] = fitsInFourDigits ? 'u' : 'U';
+
+    // Emit nibbles from the most significant one down to the least.
+    for (int shift = (digitCount - 1) * 4; shift >= 0; shift -= 4) {
+        buf[len++] = HexDigit((rune >> shift) & 0xf);
+    }
+
+    out->Write(buf, len);
+}
+
+// Accumulates up to `maxlen` octal digits from [p, e) into *value (which the
+// caller pre-seeds). `p` is advanced past every character that was inspected,
+// including a non-octal one that stops the scan. Returns true only when the
+// whole digit budget was consumed; fails early if *value already exceeds 255
+// before another digit is read.
+static bool TryParseOctal(const char*& p, const char* e, int maxlen, wchar32* value)
+{
+    for (;;) {
+        // Equivalent of the post-decrement test: look at the budget, then
+        // decrement it regardless of the outcome.
+        const bool budgetLeft = (maxlen != 0);
+        --maxlen;
+        if (!budgetLeft || p == e) {
+            break;
+        }
+
+        if (*value > 255) {
+            return false;
+        }
+
+        const char ch = *p;
+        ++p;
+        if (ch < '0' || ch > '7') {
+            break;
+        }
+
+        *value = *value * 8 + (ch - '0');
+    }
+
+    // -1 is reached only when the loop stopped because the budget ran out.
+    return (maxlen == -1);
+}
+
+// Accumulates up to `maxlen` hexadecimal digits (either case) from [p, e) into
+// *value. `p` is advanced past every character that was inspected, including a
+// non-hex one that stops the scan. Returns true only when the whole digit
+// budget was consumed.
+static bool TryParseHex(const char*& p, const char* e, int maxlen, wchar32* value)
+{
+    for (;;) {
+        // Equivalent of `maxlen-- > 0`: test first, always decrement.
+        const bool budgetLeft = (maxlen > 0);
+        --maxlen;
+        if (!budgetLeft || p == e) {
+            break;
+        }
+
+        char ch = *p;
+        ++p;
+
+        int digit = 0;
+        if (ch >= '0' && ch <= '9') {
+            digit = ch - '0';
+        } else {
+            // to lower case
+            ch |= 0x20;
+            if (ch < 'a' || ch > 'f') {
+                break;
+            }
+            digit = (ch - 'a') + 10;
+        }
+
+        *value = *value * 16 + digit;
+    }
+
+    // -1 is reached only when the loop stopped because the budget ran out.
+    return (maxlen == -1);
+}
+
+// A rune is accepted when it does not exceed 0x10ffff and does not fall into
+// the 0xd800..0xdfff range.
+static bool IsValidUtf8Rune(wchar32 value) {
+    if (value > 0x10ffff) {
+        return false;
+    }
+    const bool inExcludedRange = (value >= 0xd800 && value <= 0xdfff);
+    return !inExcludedRange;
+}
+
+// Maps an EUnescapeResult code to its human-readable message. A value that
+// matches none of the listed enumerators yields "Unknown unescape error".
+TStringBuf UnescapeResultToString(EUnescapeResult result)
+{
+    TStringBuf message = "Unknown unescape error";
+    switch (result) {
+        case EUnescapeResult::OK:
+            message = "OK";
+            break;
+        case EUnescapeResult::INVALID_ESCAPE_SEQUENCE:
+            message = "Expected escape sequence";
+            break;
+        case EUnescapeResult::INVALID_BINARY:
+            message = "Invalid binary value";
+            break;
+        case EUnescapeResult::INVALID_OCTAL:
+            message = "Invalid octal value";
+            break;
+        case EUnescapeResult::INVALID_HEXADECIMAL:
+            message = "Invalid hexadecimal value";
+            break;
+        case EUnescapeResult::INVALID_UNICODE:
+            message = "Invalid unicode value";
+            break;
+        case EUnescapeResult::INVALID_END:
+            message = "Unexpected end of atom";
+            break;
+    }
+    return message;
+}
+
+// Writes `atom` to `out` as an escaped literal enclosed in `quoteChar`.
+//
+// The input is scanned as UTF-8: every sequence that SafeReadUTF8Char decodes
+// successfully into a rune accepted by IsValidUtf8Rune is printed through
+// EscapedPrintUnicode; any other byte is printed on its own through
+// EscapedPrintChar.
+//
+// Every occurrence of `quoteChar` inside the atom is emitted as a backslash
+// followed by the quote. EscapedPrintChar only escapes '"', so without this
+// step a literal quoted with another character (for example '\'' or '`')
+// would contain a bare quote and UnescapeArbitraryAtom would stop at it,
+// truncating the value. For quoteChar == '"' the output is unchanged.
+void EscapeArbitraryAtom(TStringBuf atom, char quoteChar, IOutputStream* out)
+{
+    out->Write(quoteChar);
+    const ui8 quoteByte = static_cast<ui8>(quoteChar);
+    const char escapedQuote[2] = { '\\', quoteChar };
+    const ui8 *p = reinterpret_cast<const ui8*>(atom.begin()),
+              *e = reinterpret_cast<const ui8*>(atom.end());
+    while (p != e) {
+        if (*p == quoteByte) {
+            // Embedded quote: must be escaped so the literal stays intact.
+            out->Write(escapedQuote, 2);
+            ++p;
+            continue;
+        }
+
+        wchar32 rune = 0;
+        size_t rune_len = 0;
+
+        if (SafeReadUTF8Char(rune, rune_len, p, e) == RECODE_RESULT::RECODE_OK && IsValidUtf8Rune(rune)) {
+            EscapedPrintUnicode(rune, out);
+            p += rune_len;
+        } else {
+            // Not decodable as UTF-8 at this position: escape the raw byte.
+            EscapedPrintChar(*p++, out);
+        }
+    }
+    out->Write(quoteChar);
+}
+
+// Decodes an escaped atom body from `atom` into `out`, stopping at the
+// terminating `endChar`.
+//
+// `atom` starts right after the opening quote. On every return path
+// *readBytes is set to the number of input bytes consumed so far; on success
+// this includes the terminating quote, because `p` has already moved past it.
+// `readBytes` is dereferenced unconditionally and must not be null.
+//
+// Returns:
+//   OK                       - the terminator was found;
+//   INVALID_ESCAPE_SEQUENCE  - a backslash is the last input byte, or (backtick
+//                              mode) a single backtick is followed by a
+//                              character other than a backtick;
+//   INVALID_OCTAL / INVALID_HEXADECIMAL / INVALID_UNICODE
+//                            - malformed \NNN, \xHH or \u / \U sequence;
+//   INVALID_END              - input ended without a terminator.
+EUnescapeResult UnescapeArbitraryAtom(
+        TStringBuf atom, char endChar, IOutputStream* out, size_t* readBytes)
+{
+    const char *p = atom.begin(),
+               *e = atom.end();
+
+    while (p != e) {
+        char current = *p++;
+
+        // C-style escape sequences
+        if (current == '\\') {
+            if (p == e) {
+                // Dangling backslash at the very end of the input.
+                *readBytes = p - atom.begin();
+                return EUnescapeResult::INVALID_ESCAPE_SEQUENCE;
+            }
+
+            char next = *p++;
+            switch (next) {
+                case 't': current = '\t'; break;
+                case 'n': current = '\n'; break;
+                case 'r': current = '\r'; break;
+                case 'b': current = '\b'; break;
+                case 'f': current = '\f'; break;
+                case 'a': current = '\a'; break;
+                case 'v': current = '\v'; break;
+                case '0': case '1': case '2': case '3': {
+                    // Octal escape: the first digit (0..3) seeds the value and
+                    // exactly two more octal digits are required.
+                    wchar32 value = (next - '0');
+                    if (!TryParseOctal(p, e, 2, &value)) {
+                        *readBytes = p - atom.begin();
+                        return EUnescapeResult::INVALID_OCTAL;
+                    }
+                    current = value & 0xff;
+                    break;
+                }
+                case 'x': {
+                    // Hex escape: exactly two hex digits produce one byte.
+                    wchar32 value = 0;
+                    if (!TryParseHex(p, e, 2, &value)) {
+                        *readBytes = p - atom.begin();
+                        return EUnescapeResult::INVALID_HEXADECIMAL;
+                    }
+                    current = value & 0xff;
+                    break;
+                }
+                case 'u':
+                case 'U': {
+                    // Unicode escape: \u takes 4 hex digits, \U takes 8. The
+                    // value must also pass IsValidUtf8Rune.
+                    wchar32 value = 0;
+                    int len = (next == 'u' ? 4 : 8);
+                    if (!TryParseHex(p, e, len, &value) || !IsValidUtf8Rune(value)) {
+                        *readBytes = p - atom.begin();
+                        return EUnescapeResult::INVALID_UNICODE;
+                    }
+                    // The rune is converted by WideToUTF8 (presumably to its
+                    // UTF-8 encoding, at most 4 bytes for a valid rune) and
+                    // written directly; `continue` skips the single-byte
+                    // write at the bottom of the loop.
+                    size_t written = 0;
+                    char buf[4];
+                    WideToUTF8(&value, 1, buf, written);
+                    out->Write(buf, written);
+                    continue;
+                }
+                default: {
+                    // Any other escaped character stands for itself (this is
+                    // how \\, \" and an escaped terminator are decoded).
+                    current = next;
+                }
+            }
+        } else if (endChar == '`') {
+            // Backtick-quoted atoms: a doubled backtick denotes one literal
+            // backtick, and a single backtick is accepted as the terminator
+            // only when it is the last byte of the input.
+            if (current == '`') {
+                if (p == e) {
+                    *readBytes = p - atom.begin();
+                    return EUnescapeResult::OK;
+                } else {
+                    if (*p != '`') {
+                        *readBytes = p - atom.begin();
+                        return EUnescapeResult::INVALID_ESCAPE_SEQUENCE;
+                    } else {
+                        // Skip the second backtick; `current` (one backtick)
+                        // is written below.
+                        p++;
+                    }
+                }
+            }
+        } else if (current == endChar) {
+            // Unescaped terminator: done.
+            *readBytes = p - atom.begin();
+            return EUnescapeResult::OK;
+        }
+
+        out->Write(current);
+    }
+
+    // Ran out of input before seeing the terminator.
+    *readBytes = p - atom.begin();
+    return EUnescapeResult::INVALID_END;
+}
+
+// Writes `atom` to `out` as a binary literal: the letter 'x', the opening
+// quote, the HexEncode() rendering of the atom's bytes, and the closing quote.
+void EscapeBinaryAtom(TStringBuf atom, char quoteChar, IOutputStream* out)
+{
+    const char opening[2] = { 'x', quoteChar };
+    out->Write(opening, 2);
+
+    const auto hexPayload = HexEncode(atom.data(), atom.size());
+    out->Write(hexPayload);
+
+    out->Write(quoteChar);
+}
+
+// Decodes a hex-encoded binary atom body from `atom` into `out`, two hex
+// digits per output byte, until `endChar` is met.
+//
+// *readBytes receives the number of input bytes consumed. On success the
+// terminator itself is NOT counted (the cursor stays on it). Returns
+// INVALID_BINARY when a pair of hex digits cannot be read and INVALID_END when
+// the input ends before the terminator.
+EUnescapeResult UnescapeBinaryAtom(
+        TStringBuf atom, char endChar, IOutputStream* out, size_t* readBytes)
+{
+    const char* const begin = atom.begin();
+    const char* const end = atom.end();
+
+    for (const char* cursor = begin; cursor != end; ) {
+        if (*cursor == endChar) {
+            *readBytes = cursor - begin;
+            return EUnescapeResult::OK;
+        }
+
+        wchar32 decoded = 0;
+        const bool parsed = TryParseHex(cursor, end, 2, &decoded);
+        if (!parsed) {
+            *readBytes = cursor - begin;
+            return EUnescapeResult::INVALID_BINARY;
+        }
+
+        out->Write(decoded & 0xff);
+    }
+
+    // The loop only ends normally when the cursor has reached the end.
+    *readBytes = end - begin;
+    return EUnescapeResult::INVALID_END;
+}
+
+} // namespace NYql

Some files were not shown because too many files changed in this diff