Browse Source

Prepare move yql/minikql YQL-19206

types,jsonpath,dom
commit_hash:6b54be5968b6a30b6d97fe3a1611574bcefc749e
vvvv 4 months ago
parent
commit
cf2a23963a

+ 316 - 0
yql/essentials/core/cbo/cbo_hints.cpp

@@ -0,0 +1,316 @@
+#include "cbo_optimizer_new.h"
+
+#include <util/string/join.h>
+#include <util/string/printf.h>
+#include <library/cpp/iterator/zip.h>
+
+using namespace NYql;
+
+/**
+ * Hand-written recursive-descent parser for the optimizer hints mini-language.
+ *
+ * Recognised statements (any number of them, separated by whitespace):
+ *   JoinOrder(<tree>) / Leading(<tree>)  - <tree> is two items, each a label or a nested (<tree>)
+ *   JoinType(l1 ... ln Shuffle|Lookup|Broadcast)
+ *   Rows(l1 ... ln (+|-|/|*|#) <number>)
+ *
+ * Cursor convention: Pos is the index of the last consumed character (-1 before
+ * anything has been read) and Size is the index of the last character of Text.
+ * The next unread character is therefore Text[Pos + 1], and "Pos < Size" means
+ * that there is still input left. Syntax errors are reported by throwing
+ * (Y_ENSURE).
+ */
+class TOptimizerHintsParser {
+public:
+    // Only a reference to `text` is stored: the string must outlive the parser.
+    explicit TOptimizerHintsParser(const TString& text)
+        : Pos(-1)
+        , Size(static_cast<i32>(text.size()) - 1)
+        , Text(text)
+    {}
+
+    // Parses the whole text and returns the collected hints.
+    TOptimizerHints Parse() {
+        Start();
+        return Hints;
+    }
+
+private:
+    // Top-level loop: reads hint statements until the input is exhausted.
+    void Start() {
+        // Skip leading whitespace first, so that a blank hints string produces
+        // no hints instead of failing with "got end of the string".
+        SkipWhiteSpaces();
+
+        while (Pos < Size) {
+            auto hintType = Keyword({"JoinOrder", "Leading", "JoinType", "Rows"});
+            if (hintType == "JoinOrder" || hintType == "Leading") {
+                JoinOrder(hintType == "Leading");
+            } else if (hintType == "JoinType") {
+                JoinType();
+            } else if (hintType == "Rows"){
+                Rows();
+            } else {
+                ParseError(Sprintf("Undefined hints type: %s", hintType.c_str()), Pos - hintType.size());
+            }
+
+            SkipWhiteSpaces();
+        }
+    }
+
+    // Reads labels one after another until the next token is not a label.
+    TVector<TString> CollectLabels() {
+        TVector<TString> labels;
+        while (auto maybeTerm = MaybeLabel()) {
+            labels.push_back(maybeTerm.value());
+        }
+        return labels;
+    }
+
+    // Parses the argument list of a JoinType hint: "(l1 ... ln <algo>)".
+    // The last collected term is the algorithm name, the preceding ones are labels.
+    void JoinType() {
+        i32 beginPos = Pos + 1;
+
+        Keyword({"("});
+
+        i32 labelsBeginPos = Pos + 1;
+        TVector<TString> labels = CollectLabels();
+        if (labels.size() <= 1) {
+            ParseError(Sprintf("Bad labels for JoinType hint: %s, example of the format: JoinType(t1 t2 Shuffle)", JoinSeq(", ", labels).c_str()), labelsBeginPos);
+        }
+        TString reqJoinAlgoStr = std::move(labels.back());
+        labels.pop_back();
+
+        Keyword({")"});
+
+        // Parallel arrays: joinAlgosStr[i] is the hint spelling of joinAlgos[i].
+        TVector<EJoinAlgoType> joinAlgos = {EJoinAlgoType::GraceJoin, EJoinAlgoType::LookupJoin, EJoinAlgoType::MapJoin};
+        TVector<TString> joinAlgosStr = {"Shuffle", "Lookup", "Broadcast"};
+
+        for (const auto& [joinAlgo, joinAlgoStr]: Zip(joinAlgos, joinAlgosStr)) {
+            if (reqJoinAlgoStr == joinAlgoStr) {
+                Hints.JoinAlgoHints->PushBack(std::move(labels), joinAlgo, "JoinType" + Text.substr(beginPos, Pos - beginPos + 1));
+                return;
+            }
+        }
+
+        ParseError(Sprintf("Unknown JoinType: '%s', supported algos: [%s]", reqJoinAlgoStr.c_str(), JoinSeq(", ", joinAlgosStr).c_str()), Pos - reqJoinAlgoStr.size());
+        Y_UNREACHABLE();
+    }
+
+    // Parses the argument list of a JoinOrder / Leading hint: "(<tree>)".
+    void JoinOrder(bool leading /* is keyword "Leading" or "JoinOrder" */) {
+        i32 beginPos = Pos + 1;
+
+        Keyword({"("});
+        auto joinOrderHintTree = JoinOrderLabels();
+        Keyword({")"});
+
+        // Pick the keyword first: `+` binds tighter than `?:`, so writing the
+        // choice inline would drop the argument text for the "Leading" spelling.
+        const TString keyword = leading ? "Leading" : "JoinOrder";
+        Hints.JoinOrderHints->PushBack(
+            std::move(joinOrderHintTree),
+            keyword + Text.substr(beginPos, Pos - beginPos + 1)
+        );
+    }
+
+    // Parses exactly two join-order items and combines them into a join node.
+    std::shared_ptr<TJoinOrderHints::ITreeNode> JoinOrderLabels() {
+        auto lhs = JoinOrderLabel();
+        auto rhs = JoinOrderLabel();
+        return std::make_shared<TJoinOrderHints::TJoinNode>(std::move(lhs), std::move(rhs));
+    }
+
+    // Parses one join-order item: either a relation label or a parenthesised join.
+    std::shared_ptr<TJoinOrderHints::ITreeNode> JoinOrderLabel() {
+        if (auto maybeLabel = MaybeLabel()) {
+            return std::make_shared<TJoinOrderHints::TRelationNode>(std::move(maybeLabel.value()));
+        } else if (auto maybeBracket = MaybeKeyword({"("})) {
+            auto join = JoinOrderLabels();
+            Keyword({")"});
+            return join;
+        }
+
+        ParseError(Sprintf("JoinOrder args must be either a relation, either a join, example of the format: JoinOrder(t1 (t2 t3))"), Pos);
+        Y_UNREACHABLE();
+    }
+
+    // Parses the argument list of a Rows hint: "(l1 ... ln <op> <number>)".
+    void Rows() {
+        i32 beginPos = Pos + 1;
+
+        Keyword({"("});
+
+        TVector<TString> labels = CollectLabels();
+        auto signStr = Keyword({"+", "-", "/", "*", "#"});
+        char sign = signStr[0];
+        auto value = Number();
+        Keyword({")"});
+
+        TCardinalityHints::ECardOperation op;
+        switch (sign) {
+            case '+': { op = TCardinalityHints::ECardOperation::Add; break; }
+            case '-': { op = TCardinalityHints::ECardOperation::Subtract; break; }
+            case '/': { op = TCardinalityHints::ECardOperation::Divide; break; }
+            case '*': { op = TCardinalityHints::ECardOperation::Multiply; break; }
+            case '#': { op = TCardinalityHints::ECardOperation::Replace; break; }
+            default: {ParseError(Sprintf("Unknown operation: '%c'", sign), Pos - 1); Y_UNREACHABLE();}
+        }
+
+        Hints.CardinalityHints->PushBack(std::move(labels), op, value, "Rows" + Text.substr(beginPos, Pos - beginPos + 1));
+    }
+
+private:
+    // Expressions
+
+    // Throws a parser error that carries the line / in-line position of `pos`.
+    void ParseError(const TString& err, i32 pos) {
+        auto [line, linePos] = GetLineAndLinePosFromTextPos(pos);
+        Y_ENSURE(false, Sprintf("Optimizer hints parser error at [line:%d, pos:%d], msg: %s", line, linePos, err.c_str()));
+    }
+
+    // A label is a non-empty run of letters and digits.
+    TString Label() {
+        return Term(Letters() | Digits());
+    }
+
+    // Like Label(), but returns nullopt instead of throwing when no label follows.
+    std::optional<TString> MaybeLabel() {
+        try {
+            return Label();
+        } catch (...) {
+            return std::nullopt;
+        }
+    }
+
+    // Skips whitespace and reads the longest run of characters accepted by
+    // Char(allowedSym); throws if the run is empty or the input has ended.
+    TString Term(const std::bitset<256>& allowedSym = {}) {
+        SkipWhiteSpaces();
+        Y_ENSURE(Pos < Size, "Expected <string>, but got end of the string.");
+
+        TString term;
+        while (Pos < Size) {
+            try {
+                term.push_back(Char(allowedSym));
+            } catch (...) {
+                // The next character is not allowed: the term ends here.
+                break;
+            }
+        }
+
+        if (term.empty()) {
+            ParseError("Expected a term!", Pos);
+        }
+        return term;
+    }
+
+    // Consumes the next character if it matches `c` (case-insensitively).
+    char Char(unsigned char c) {
+        std::bitset<256> allowed;
+        allowed[c] = 1;
+        return Char(allowed);
+    }
+
+    // Consumes the next character if it lies in [intervalBegin, intervalEnd] (case-insensitively).
+    char Char(unsigned char intervalBegin, unsigned char intervalEnd) {
+        std::bitset<256> allowed;
+        for (size_t i = intervalBegin; i <= intervalEnd; ++i) {
+            allowed[i] = 1;
+        }
+        return Char(allowed);
+    }
+
+    // Consumes and returns the next character. An empty set accepts anything;
+    // otherwise the character must match one of the allowed symbols ignoring
+    // case. Throws without consuming when the character is not accepted.
+    char Char(const std::bitset<256>& allowedSymbols = {}) {
+        Y_ENSURE(Pos < Size, Sprintf("Expected [%s], but got end of the string.", ""));
+
+        const char nextSym = Text[Pos + 1];
+        if (allowedSymbols.none()) {
+            ++Pos;
+            return nextSym;
+        }
+
+        // tolower() is only defined for values representable as unsigned char
+        // (or EOF), so a possibly negative char must be converted first.
+        const int nextLower = tolower(static_cast<unsigned char>(nextSym));
+        for (size_t i = 0; i < allowedSymbols.size(); ++i) {
+            if (allowedSymbols[i] && tolower(static_cast<int>(i)) == nextLower) {
+                ++Pos;
+                return nextSym;
+            }
+        }
+
+        ParseError(Sprintf("Expected [%s], but got [%c]", "", nextSym), Pos);
+        Y_UNREACHABLE();
+    }
+
+    // Like Keyword(), but returns nullopt instead of throwing when nothing matches.
+    std::optional<TString> MaybeKeyword(const TVector<TString>& keywords) {
+        try {
+            return Keyword(keywords);
+        } catch(...) {
+            return std::nullopt;
+        }
+    }
+
+    // Skips whitespace and consumes the first keyword from `keywords` that the
+    // remaining input starts with (exact, case-sensitive comparison).
+    TString Keyword(const TVector<TString>& keywords) {
+        SkipWhiteSpaces();
+        Y_ENSURE(Pos < Size, Sprintf("Expected [%s], but got end of the string.", JoinSeq(", ", keywords).c_str()));
+
+        for (const auto& keyword: keywords) {
+            size_t lowInclude = Pos + 1;
+            size_t highExclude = lowInclude + keyword.size();
+
+            if (Text.substr(lowInclude, highExclude - lowInclude).equal(keyword)) {
+                Pos += keyword.size();
+                return keyword;
+            }
+        }
+
+        ParseError(Sprintf("Expected [%s], but got [%c]", JoinSeq(", ", keywords).c_str(), Text[Pos + 1]), Pos);
+        Y_UNREACHABLE();
+    }
+
+    // Reads an optionally signed floating point number.
+    double Number() {
+        SkipWhiteSpaces();
+        Y_ENSURE(Pos < Size, Sprintf("Expected number, but got end of the string."));
+
+        TString number;
+        if (auto maybeSign = MaybeKeyword({"+", "-"})) {
+            number.push_back(maybeSign.value()[0]);
+        }
+
+        auto term = Term(Digits() | Chars(".-e")); // for double like 1.0 / 1e9
+        // The sign was consumed separately; glue it back so that it is not lost.
+        number += term;
+        try {
+            return std::stod(number);
+        } catch (...) {
+            ParseError(Sprintf("Expected a number, got [%s]", number.c_str()), Pos - term.size());
+        }
+        Y_UNREACHABLE();
+    }
+
+private:
+    // Helpers
+
+    // Character set containing every byte of `s`.
+    constexpr std::bitset<256> Chars(const TString& s) {
+        std::bitset<256> res;
+
+        for (char c: s) {
+            // Index by the byte value: a negative char would index out of range.
+            res[static_cast<unsigned char>(c)] = 1;
+        }
+
+        return res;
+    }
+
+    // Character set of ASCII letters.
+    constexpr std::bitset<256> Letters() {
+        std::bitset<256> res;
+
+        for (unsigned char i = 'a'; i <= 'z'; ++i) {
+            res[i] = 1;
+        }
+        for (unsigned char i = 'A'; i <= 'Z'; ++i) {
+            res[i] = 1;
+        }
+
+        return res;
+    }
+
+    // Character set of ASCII digits.
+    constexpr std::bitset<256> Digits() {
+        std::bitset<256> res;
+
+        for (unsigned char i = '0'; i <= '9'; ++i) {
+            res[i] = 1;
+        }
+
+        return res;
+    }
+
+    // Advances the cursor over whitespace characters.
+    void SkipWhiteSpaces() {
+        for (; Pos < Size && isspace(Text[Pos + 1]); ++Pos) {
+        }
+    }
+
+    // Converts a text offset into {line, position in line}. The line counter
+    // starts at 0; the position counts the non-newline characters of that line
+    // up to and including `pos`.
+    std::pair<i32, i32> GetLineAndLinePosFromTextPos(i32 pos) {
+        i32 line = 0;
+        i32 linePos = 0;
+
+        for (i32 i = 0; i <= pos && i < static_cast<i32>(Text.size()); ++i) {
+            if (Text[i] == '\n') {
+                linePos = 0;
+                ++line;
+            } else {
+                ++linePos;
+            }
+        }
+
+        return {line, linePos};
+    }
+
+private:
+    i32 Pos;            // index of the last consumed character, -1 initially
+    const i32 Size;     // index of the last character of Text
+    const TString& Text;
+
+private:
+    TOptimizerHints Hints;
+};
+
+// Entry point: parses the hints text with a one-shot TOptimizerHintsParser.
+TOptimizerHints TOptimizerHints::Parse(const TString& text) {
+    TOptimizerHintsParser parser(text);
+    return parser.Parse();
+}

+ 308 - 0
yql/essentials/core/cbo/cbo_optimizer_new.cpp

@@ -0,0 +1,308 @@
+#include "cbo_optimizer_new.h"
+
+#include <array>
+
+#include <util/string/builder.h>
+#include <util/generic/hash.h>
+#include <util/generic/hash_set.h>
+#include <util/string/cast.h>
+#include <util/string/join.h>
+#include <util/string/printf.h>
+
+const TString& ToString(NYql::EJoinKind);
+const TString& ToString(NYql::EJoinAlgoType);
+
+namespace NYql {
+
+using namespace NYql::NDq;
+
+namespace {
+
+    // Textual join kind name -> EJoinKind.
+    THashMap<TString, EJoinKind> JoinKindMap = {
+        {"Inner", EJoinKind::InnerJoin},
+        {"Left", EJoinKind::LeftJoin},
+        {"Right", EJoinKind::RightJoin},
+        {"Full", EJoinKind::OuterJoin},
+        {"LeftOnly", EJoinKind::LeftOnly},
+        {"RightOnly", EJoinKind::RightOnly},
+        {"Exclusion", EJoinKind::Exclusion},
+        {"LeftSemi", EJoinKind::LeftSemi},
+        {"RightSemi", EJoinKind::RightSemi},
+        {"Cross", EJoinKind::Cross},
+    };
+
+    // Cardinality hint operator symbol -> ECardOperation.
+    THashMap<TString, TCardinalityHints::ECardOperation> HintOpMap = {
+        {"+", TCardinalityHints::ECardOperation::Add},
+        {"-", TCardinalityHints::ECardOperation::Subtract},
+        {"*", TCardinalityHints::ECardOperation::Multiply},
+        {"/", TCardinalityHints::ECardOperation::Divide},
+        {"#", TCardinalityHints::ECardOperation::Replace},
+    };
+
+}
+
+// Maps a join kind name (a key of JoinKindMap) to its enum value.
+// Throws via Y_ENSURE when the name is not in the map.
+EJoinKind ConvertToJoinKind(const TString& joinString) {
+    const auto found = JoinKindMap.find(joinString);
+    Y_ENSURE(found != JoinKindMap.end());
+    return found->second;
+}
+
+// Reverse lookup in JoinKindMap: returns the name registered for `kind`.
+// Throws via Y_ENSURE when no entry has that value.
+TString ConvertToJoinString(const EJoinKind kind) {
+    // Bind by reference: a by-value binding would copy every TString key visited.
+    for (const auto& [name, value] : JoinKindMap) {
+        if (value == kind) {
+            return name;
+        }
+    }
+
+    Y_ENSURE(false,"Unknown join kind");
+}
+
+// A relation node contributes exactly one label: its own.
+TVector<TString> TRelOptimizerNode::Labels()  {
+    return TVector<TString>(1, Label);
+}
+
+// Writes two lines to `stream` (the label, then the statistics), each
+// indented by `ntabs` groups of four spaces.
+void TRelOptimizerNode::Print(std::stringstream& stream, int ntabs) {
+    const auto indent = [&stream, ntabs]() {
+        for (int level = 0; level < ntabs; ++level) {
+            stream << "    ";
+        }
+    };
+
+    indent();
+    stream << "Rel: " << Label << "\n";
+
+    indent();
+    stream << Stats << "\n";
+}
+
+// Builds a join node over two children. Note that the flag is passed as
+// "nonReorderable" and stored inverted in IsReorderable.
+TJoinOptimizerNode::TJoinOptimizerNode(
+    const std::shared_ptr<IBaseOptimizerNode>& left,
+    const std::shared_ptr<IBaseOptimizerNode>& right,
+    TVector<TJoinColumn> leftKeys,
+    TVector<TJoinColumn> rightKeys,
+    const EJoinKind joinType,
+    const EJoinAlgoType joinAlgo,
+    bool leftAny,
+    bool rightAny,
+    bool nonReorderable
+)   : IBaseOptimizerNode(JoinNodeType)
+    , LeftArg(left)
+    , RightArg(right)
+    // The key vectors are taken by value, so move them instead of copying again.
+    , LeftJoinKeys(std::move(leftKeys))
+    , RightJoinKeys(std::move(rightKeys))
+    , JoinType(joinType)
+    , JoinAlgo(joinAlgo)
+    , LeftAny(leftAny)
+    , RightAny(rightAny)
+    , IsReorderable(!nonReorderable)
+{}
+
+// Labels of both subtrees; the right subtree's labels come first in the result.
+TVector<TString> TJoinOptimizerNode::Labels() {
+    auto leftLabels = LeftArg->Labels();
+    auto combined = RightArg->Labels();
+    combined.insert(combined.end(), leftLabels.begin(), leftLabels.end());
+    return combined;
+}
+
+// Writes the join header (kind, algorithm, ANY flags and key pairs), then the
+// statistics, then both children one indentation level deeper.
+// RightJoinKeys is indexed in parallel with LeftJoinKeys.
+void TJoinOptimizerNode::Print(std::stringstream& stream, int ntabs) {
+    const auto indent = [&stream, ntabs]() {
+        for (int level = 0; level < ntabs; ++level) {
+            stream << "    ";
+        }
+    };
+
+    indent();
+    stream << "Join: (" << ToString(JoinType) << "," << ToString(JoinAlgo);
+    if (LeftAny) {
+        stream << ",LeftAny";
+    }
+    if (RightAny) {
+        stream << ",RightAny";
+    }
+    stream << ") ";
+
+    for (size_t idx = 0; idx < LeftJoinKeys.size(); ++idx) {
+        const auto& leftKey = LeftJoinKeys[idx];
+        const auto& rightKey = RightJoinKeys[idx];
+        stream << leftKey.RelName << "." << leftKey.AttributeName
+            << "=" << rightKey.RelName << "."
+            << rightKey.AttributeName << ",";
+    }
+    stream << "\n";
+
+    indent();
+    stream << Stats << "\n";
+
+    LeftArg->Print(stream, ntabs + 1);
+    RightArg->Print(stream, ntabs + 1);
+}
+
+// Returns true when every key column recorded in `stats` appears (by attribute
+// name) among `joinKeys`. Returns false when no key columns are recorded.
+bool IsPKJoin(const TOptimizerStatistics& stats, const TVector<TJoinColumn>& joinKeys) {
+    if (!stats.KeyColumns) {
+        return false;
+    }
+
+    for (const auto& keyColumn : stats.KeyColumns->Data) {
+        const bool covered = std::any_of(joinKeys.begin(), joinKeys.end(),
+            [&keyColumn](const TJoinColumn& c) { return c.AttributeName == keyColumn; });
+        if (!covered) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Default applicability rule: only MapJoin is accepted; every other argument
+// is ignored.
+bool TBaseProviderContext::IsJoinApplicable(const std::shared_ptr<IBaseOptimizerNode>& /* left */,
+    const std::shared_ptr<IBaseOptimizerNode>& /* right */,
+    const TVector<TJoinColumn>& /* leftJoinKeys */,
+    const TVector<TJoinColumn>& /* rightJoinKeys */,
+    EJoinAlgoType joinAlgo,
+    EJoinKind /* joinKind */) {
+
+    const bool isMapJoin = (joinAlgo == EJoinAlgoType::MapJoin);
+    return isMapJoin;
+}
+
+// Default join cost: left rows + 2 * right rows + output rows. The output byte
+// size and the join algorithm do not take part in the formula.
+double TBaseProviderContext::ComputeJoinCost(const TOptimizerStatistics& leftStats, const TOptimizerStatistics& rightStats, const double outputRows, const double /* outputByteSize */, EJoinAlgoType /* joinAlgo */) const {
+    const auto weightedRightRows = 2.0 * rightStats.Nrows;
+    return leftStats.Nrows + weightedRightRows + outputRows;
+}
+
+/**
+ * Compute the cost and output cardinality of a join
+ *
+ * Currently a very basic computation targeted at GraceJoin
+ *
+ * The build is on the right side, so we make the build side a bit more expensive than the probe
+ *
+ * Three cases are distinguished:
+ *  - the right side is joined on its key columns: the output follows the left side;
+ *  - the left side is joined on its key columns: the output follows the right side;
+ *  - otherwise: the estimate uses the number of unique values of the first join
+ *    key on each side when column statistics are available, or a fixed 0.2
+ *    factor of the cross product when they are not.
+ * If `maybeHint` is given, it is applied to the estimated cardinality.
+*/
+
+TOptimizerStatistics TBaseProviderContext::ComputeJoinStats(
+    const TOptimizerStatistics& leftStats,
+    const TOptimizerStatistics& rightStats,
+    const TVector<TJoinColumn>& leftJoinKeys,
+    const TVector<TJoinColumn>& rightJoinKeys,
+    EJoinAlgoType joinAlgo,
+    EJoinKind joinKind,
+    TCardinalityHints::TCardinalityHint* maybeHint) const
+{
+    double newCard{};
+    EStatisticsType outputType;
+    bool leftKeyColumns = false;
+    bool rightKeyColumns = false;
+    double selectivity = 1.0;
+
+    bool isRightPKJoin = IsPKJoin(rightStats,rightJoinKeys);
+    bool isLeftPKJoin = IsPKJoin(leftStats,leftJoinKeys);
+
+    // Both sides are joined on their keys: keep the variant that gives the
+    // smaller cardinality estimate.
+    if (isRightPKJoin && isLeftPKJoin) {
+        auto rightPKJoinCard = leftStats.Nrows * rightStats.Selectivity;
+        auto leftPKJoinCard = rightStats.Nrows * leftStats.Selectivity;
+        if (rightPKJoinCard > leftPKJoinCard) {
+            isRightPKJoin = false;
+        }
+    }
+
+    if (isRightPKJoin) {
+        switch (joinKind) {
+            case EJoinKind::LeftJoin:
+            case EJoinKind::LeftOnly:
+                newCard = leftStats.Nrows; break;
+            default: {
+                newCard = leftStats.Nrows * rightStats.Selectivity;
+            }
+        }
+
+        selectivity = leftStats.Selectivity * rightStats.Selectivity;
+        leftKeyColumns = true;
+        if (leftStats.Type == EStatisticsType::BaseTable){
+            outputType = EStatisticsType::FilteredFactTable;
+        } else {
+            outputType = leftStats.Type;
+        }
+    } else if (isLeftPKJoin) {
+        switch (joinKind) {
+            case EJoinKind::RightJoin:
+            case EJoinKind::RightOnly:
+                newCard = rightStats.Nrows; break;
+            default: {
+                newCard = leftStats.Selectivity * rightStats.Nrows;
+            }
+        }
+
+        selectivity = leftStats.Selectivity * rightStats.Selectivity;
+        rightKeyColumns = true;
+        if (rightStats.Type == EStatisticsType::BaseTable){
+            outputType = EStatisticsType::FilteredFactTable;
+        } else {
+            outputType = rightStats.Type;
+        }
+    } else {
+        std::optional<double> lhsUniqueVals;
+        std::optional<double> rhsUniqueVals;
+        if (leftStats.ColumnStatistics && rightStats.ColumnStatistics && !leftJoinKeys.empty() && !rightJoinKeys.empty()) {
+            auto lhs = leftJoinKeys[0].AttributeName;
+            lhsUniqueVals = leftStats.ColumnStatistics->Data[lhs].NumUniqueVals;
+            auto rhs = rightJoinKeys[0].AttributeName;
+            // The right-hand value must come from the right side's statistics
+            // (it used to be read from the left side a second time).
+            rhsUniqueVals = rightStats.ColumnStatistics->Data[rhs].NumUniqueVals;
+        }
+
+        if (lhsUniqueVals.has_value() && rhsUniqueVals.has_value()) {
+            newCard = leftStats.Nrows * rightStats.Nrows / std::max(*lhsUniqueVals, *rhsUniqueVals);
+        } else {
+            newCard = 0.2 * leftStats.Nrows * rightStats.Nrows;
+        }
+
+        outputType = EStatisticsType::ManyManyJoin;
+    }
+
+    if (maybeHint) {
+        newCard = maybeHint->ApplyHint(newCard);
+    }
+
+    int newNCols = leftStats.Ncols + rightStats.Ncols;
+    // Average row width of each side times the output cardinality. Each
+    // conditional is parenthesised: `?:` binds weaker than `+`, so without the
+    // parentheses the right-hand term would never be added.
+    double newByteSize = (leftStats.Nrows ? (leftStats.ByteSize / leftStats.Nrows) * newCard : 0) +
+            (rightStats.Nrows ? (rightStats.ByteSize / rightStats.Nrows) * newCard : 0);
+
+    double cost = ComputeJoinCost(leftStats, rightStats, newCard, newByteSize, joinAlgo)
+        + leftStats.Cost + rightStats.Cost;
+
+    auto result = TOptimizerStatistics(outputType, newCard, newNCols, newByteSize, cost,
+        leftKeyColumns ? leftStats.KeyColumns : ( rightKeyColumns ? rightStats.KeyColumns : TIntrusivePtr<TOptimizerStatistics::TKeyColumns>()));
+    result.Selectivity = selectivity;
+    return result;
+}
+
+// Returns a reference to a function-local static default context.
+const TBaseProviderContext& TBaseProviderContext::Instance() {
+    static TBaseProviderContext defaultContext;
+    return defaultContext;
+}
+
+// Collects the textual form of every hint whose Applied flag is not set, in
+// the order: join algorithm hints, join order hints, cardinality hints.
+TVector<TString> TOptimizerHints::GetUnappliedString() {
+    TVector<TString> unapplied;
+
+    const auto collect = [&unapplied](const auto& hints) {
+        for (const auto& hint : hints) {
+            if (hint.Applied) {
+                continue;
+            }
+            unapplied.push_back(hint.StringRepr);
+        }
+    };
+
+    collect(JoinAlgoHints->Hints);
+    collect(JoinOrderHints->Hints);
+    collect(CardinalityHints->Hints);
+
+    return unapplied;
+}
+
+} // namespace NYql

+ 312 - 0
yql/essentials/core/cbo/cbo_optimizer_new.h

@@ -0,0 +1,312 @@
+#pragma once
+
+#include <util/generic/vector.h>
+#include <util/generic/string.h>
+#include <contrib/ydb/library/yql/core/yql_statistics.h>
+#include <contrib/ydb/library/yql/core/yql_cost_function.h>
+
+#include <unordered_map>
+#include <memory>
+#include <map>
+#include <sstream>
+
+namespace NYql {
+
+/**
+ * Kinds of internal operator representations used by the cost-based
+ * optimizer: RelOptimizerNode is an input relation of the equi-join,
+ * JoinOptimizerNode is a join that connects two sets of relations.
+*/
+enum EOptimizerNodeKind: ui32 {
+    RelNodeType,  // TRelOptimizerNode
+    JoinNodeType  // TJoinOptimizerNode
+};
+
+/**
+ * BaseOptimizerNode is a base class for the internal optimizer nodes
+ * It stores the node kind and the statistics of the operator tree rooted at
+ * this node.
+*/
+struct IBaseOptimizerNode {
+    EOptimizerNodeKind Kind;
+    TOptimizerStatistics Stats;
+
+    IBaseOptimizerNode(EOptimizerNodeKind k) : Kind(k) {}
+    IBaseOptimizerNode(EOptimizerNodeKind k, TOptimizerStatistics s) :
+        Kind(k), Stats(std::move(s)) {}
+
+    // Nodes are owned and passed around through pointers to this base, so the
+    // destructor has to be virtual for destruction through the base to be safe.
+    virtual ~IBaseOptimizerNode() = default;
+
+    // Labels of all relations in the subtree rooted at this node.
+    virtual TVector<TString> Labels()=0;
+    // Writes a textual dump of the subtree, indented by `ntabs` levels.
+    virtual void Print(std::stringstream& stream, int ntabs=0)=0;
+};
+
+// Logical join kinds known to the optimizer.
+enum EJoinKind: ui32 {
+    InnerJoin,
+    LeftJoin,
+    RightJoin,
+    OuterJoin,
+    LeftOnly,   // == LeftAntiJoin
+    RightOnly,  // == RightAntiJoin
+    LeftSemi,
+    RightSemi,
+    Cross,
+    Exclusion
+};
+
+EJoinKind ConvertToJoinKind(const TString& joinString);
+TString ConvertToJoinString(const EJoinKind kind);
+
+// Cardinality hints: each one adjusts the estimated row count of the join
+// described by a set of labels.
+struct TCardinalityHints {
+    enum ECardOperation : ui32 {
+        Add,
+        Subtract,
+        Multiply,
+        Divide,
+        Replace
+    };
+
+    struct TCardinalityHint {
+        TVector<TString> JoinLabels;
+        ECardOperation Operation;
+        double Value;
+        TString StringRepr; // textual form of the hint
+        bool Applied = false; // set once ApplyHint() has been called
+
+        // Marks the hint as applied and returns `originalValue` adjusted by
+        // Operation / Value.
+        double ApplyHint(double originalValue) {
+            Applied = true;
+
+            switch (Operation) {
+                case Add:
+                    return originalValue + Value;
+                case Subtract:
+                    return originalValue - Value;
+                case Multiply:
+                    return originalValue * Value;
+                case Divide:
+                    return originalValue / Value;
+                case Replace:
+                    return Value;
+            }
+
+            // Operation holds a value outside ECardOperation. Flowing off the
+            // end of a non-void function is undefined behaviour, so return the
+            // estimate unchanged instead.
+            return originalValue;
+        }
+    };
+
+    TVector<TCardinalityHint> Hints;
+
+    // Appends a new, not yet applied hint.
+    void PushBack(TVector<TString> labels, ECardOperation op, double value, TString stringRepr) {
+        Hints.push_back({.JoinLabels = std::move(labels), .Operation = op, .Value = value, .StringRepr = std::move(stringRepr)});
+    }
+};
+
+// Join algorithm hints: each one requests an algorithm for the join described
+// by a set of labels.
+struct TJoinAlgoHints {
+    struct TJoinAlgoHint {
+        TVector<TString> JoinLabels;
+        EJoinAlgoType Algo;
+        TString StringRepr; // textual form of the hint
+        bool Applied = false;
+    };
+
+    TVector<TJoinAlgoHint> Hints;
+
+    // Appends a new hint with Applied == false.
+    void PushBack(TVector<TString> labels, EJoinAlgoType algo, TString stringRepr) {
+        TJoinAlgoHint hint{
+            .JoinLabels = std::move(labels),
+            .Algo = algo,
+            .StringRepr = std::move(stringRepr)
+        };
+        Hints.push_back(std::move(hint));
+    }
+};
+
+// Join order hints: each one is a binary tree whose leaves are relation labels
+// and whose inner nodes are joins.
+struct TJoinOrderHints {
+    struct ITreeNode {
+        enum _ : ui32 {
+            Relation,
+            Join
+        };
+
+        // Labels of all relations in the subtree.
+        virtual TVector<TString> Labels() = 0;
+        // The predicates only read Type, so they are const-qualified.
+        bool IsRelation() const { return Type == Relation; }
+        bool IsJoin() const { return Type == Join; }
+        virtual ~ITreeNode() = default;
+
+        // Overwritten by the constructors of TJoinNode / TRelationNode; the
+        // initializer only guarantees that the field is never read uninitialized.
+        ui32 Type = Relation;
+    };
+
+    struct TJoinNode: public ITreeNode {
+        TJoinNode(std::shared_ptr<ITreeNode> lhs, std::shared_ptr<ITreeNode> rhs)
+            : Lhs(std::move(lhs))
+            , Rhs(std::move(rhs))
+        {
+            this->Type = ITreeNode::Join;
+        }
+
+        // Labels of the left subtree followed by labels of the right subtree.
+        TVector<TString> Labels() override {
+            auto labels = Lhs->Labels();
+            auto rhsLabels = Rhs->Labels();
+            labels.insert(labels.end(), std::make_move_iterator(rhsLabels.begin()), std::make_move_iterator(rhsLabels.end()));
+            return labels;
+        }
+
+        std::shared_ptr<ITreeNode> Lhs;
+        std::shared_ptr<ITreeNode> Rhs;
+    };
+
+    struct TRelationNode: public ITreeNode {
+        TRelationNode(TString label)
+            : Label(std::move(label))
+        {
+            this->Type = ITreeNode::Relation;
+        }
+
+        TVector<TString> Labels() override { return {Label}; }
+
+        TString Label;
+    };
+
+    struct TJoinOrderHint {
+        std::shared_ptr<ITreeNode> Tree;
+        TString StringRepr; // textual form of the hint
+        bool Applied = false;
+    };
+
+    TVector<TJoinOrderHint> Hints;
+
+    // Appends a new hint with Applied == false.
+    void PushBack(std::shared_ptr<ITreeNode> hintTree, TString stringRepr) {
+        Hints.push_back({.Tree = std::move(hintTree), .StringRepr = std::move(stringRepr)});
+    }
+};
+
+struct TOptimizerHints {
+    std::shared_ptr<TCardinalityHints> CardinalityHints = std::make_shared<TCardinalityHints>();
+    std::shared_ptr<TJoinAlgoHints> JoinAlgoHints = std::make_shared<TJoinAlgoHints>();
+    std::shared_ptr<TJoinOrderHints> JoinOrderHints = std::make_shared<TJoinOrderHints>();
+
+    TVector<TString> GetUnappliedString();
+
+    /*
+     *   The function accepts a string with a whitespace separated sequence of
+     *   hints of three types:
+     *   1) JoinType(t1 t2 ... tn Shuffle | Lookup | Broadcast) to change the join algo
+     *      for the join, where these labels take part
+     *   2) Rows(t1 t2 ... tn (*|/|+|-|#) Number) to change the cardinality for the join,
+     *      where these labels take part ('#' replaces the value)
+     *   3) JoinOrder( (t1 t2) (t3 (t4 ...)) ) - fixate this join subtree in the general
+     *      join tree; the keyword Leading is accepted in place of JoinOrder
+     */  
+    static TOptimizerHints Parse(const TString&);
+};
+
+/**
+ * This is a temporary structure for KQP provider
+ * We will soon be supporting multiple providers and we will need to design
+ * some interfaces to pass provider-specific context to the optimizer
+*/
+struct IProviderContext {
+    virtual ~IProviderContext() = default;
+
+    // Cost of a single join given the statistics of its inputs and its estimated output.
+    virtual double ComputeJoinCost(const TOptimizerStatistics& leftStats, const TOptimizerStatistics& rightStats, const double outputRows, const double outputByteSize, EJoinAlgoType joinAlgo) const = 0;
+
+    // Statistics of the join result; `maybeHint`, when given, is a cardinality hint for this join.
+    virtual TOptimizerStatistics ComputeJoinStats(
+        const TOptimizerStatistics& leftStats,
+        const TOptimizerStatistics& rightStats,
+        const TVector<NDq::TJoinColumn>& leftJoinKeys,
+        const TVector<NDq::TJoinColumn>& rightJoinKeys,
+        EJoinAlgoType joinAlgo,
+        EJoinKind joinKind,
+        TCardinalityHints::TCardinalityHint* maybeHint = nullptr) const = 0;
+
+    // Whether `joinAlgo` may be used for the given join.
+    virtual bool IsJoinApplicable(const std::shared_ptr<IBaseOptimizerNode>& left,
+        const std::shared_ptr<IBaseOptimizerNode>& right,
+        const TVector<NDq::TJoinColumn>& leftJoinKeys,
+        const TVector<NDq::TJoinColumn>& rightJoinKeys,
+        EJoinAlgoType joinAlgo,
+        EJoinKind joinKind) = 0;
+};
+
+/**
+ * Provider context with the default cost and statistics computation.
+*/
+
+struct TBaseProviderContext : public IProviderContext {
+    TBaseProviderContext() {}
+
+    double ComputeJoinCost(
+        const TOptimizerStatistics& leftStats,
+        const TOptimizerStatistics& rightStats,
+        const double outputRows,
+        const double outputByteSize,
+        EJoinAlgoType joinAlgo) const override;
+
+    bool IsJoinApplicable(
+        const std::shared_ptr<IBaseOptimizerNode>& leftStats,
+        const std::shared_ptr<IBaseOptimizerNode>& rightStats,
+        const TVector<NDq::TJoinColumn>& leftJoinKeys,
+        const TVector<NDq::TJoinColumn>& rightJoinKeys,
+        EJoinAlgoType joinAlgo,
+        EJoinKind joinKind) override;
+
+    TOptimizerStatistics ComputeJoinStats(
+        const TOptimizerStatistics& leftStats,
+        const TOptimizerStatistics& rightStats,
+        const TVector<NDq::TJoinColumn>& leftJoinKeys,
+        const TVector<NDq::TJoinColumn>& rightJoinKeys,
+        EJoinAlgoType joinAlgo,
+        EJoinKind joinKind,
+        TCardinalityHints::TCardinalityHint* maybeHint = nullptr) const override;
+
+    // Shared default instance.
+    static const TBaseProviderContext& Instance();
+};
+
+/**
+ * RelOptimizerNode adds a label to base class
+ * This is the label assigned to the input by equi-Join
+*/
+struct TRelOptimizerNode : public IBaseOptimizerNode {
+    TString Label;
+
+    // Temporary solution to check if a LookupJoin is possible in KQP
+    //void* Expr;
+
+    // `label` is taken by value, so it is moved into the member instead of copied.
+    TRelOptimizerNode(TString label, TOptimizerStatistics stats) :
+        IBaseOptimizerNode(RelNodeType, std::move(stats)), Label(std::move(label)) { }
+    //TRelOptimizerNode(TString label, std::shared_ptr<TOptimizerStatistics> stats, const TExprNode::TPtr expr) :
+    //    IBaseOptimizerNode(RelNodeType, stats), Label(label), Expr(expr) { }
+    virtual ~TRelOptimizerNode() {}
+
+    // `override` makes the compiler check that these match the base class virtuals.
+    TVector<TString> Labels() override;
+    void Print(std::stringstream& stream, int ntabs=0) override;
+};
+
+/**
+ * JoinOptimizerNode records the left and right arguments of the join
+ * as well as the set of join conditions (LeftJoinKeys[i] = RightJoinKeys[i]),
+ * the join kind and the chosen join algorithm.
+*/
+struct TJoinOptimizerNode : public IBaseOptimizerNode {
+    std::shared_ptr<IBaseOptimizerNode> LeftArg;
+    std::shared_ptr<IBaseOptimizerNode> RightArg;
+    TVector<NDq::TJoinColumn> LeftJoinKeys;
+    TVector<NDq::TJoinColumn> RightJoinKeys;
+    EJoinKind JoinType;
+    EJoinAlgoType JoinAlgo;
+    /////////////////// 'ANY' flag means leaving only one row from the join side.
+    bool LeftAny;
+    bool RightAny;
+    ///////////////////
+    bool IsReorderable;
+
+    TJoinOptimizerNode(const std::shared_ptr<IBaseOptimizerNode>& left,
+        const std::shared_ptr<IBaseOptimizerNode>& right,
+        TVector<NDq::TJoinColumn> leftKeys,
+        TVector<NDq::TJoinColumn> rightKeys,
+        const EJoinKind joinType,
+        const EJoinAlgoType joinAlgo,
+        bool leftAny,
+        bool rightAny,
+        bool nonReorderable = false
+    );
+    virtual ~TJoinOptimizerNode() {}
+
+    // `override` makes the compiler check that these match the base class virtuals.
+    TVector<TString> Labels() override;
+    void Print(std::stringstream& stream, int ntabs=0) override;
+};
+
+// Interface of a join order optimizer bound to a provider context.
+struct IOptimizerNew {
+    IProviderContext& Pctx;
+
+    IOptimizerNew(IProviderContext& ctx)
+        : Pctx(ctx)
+    {}
+
+    virtual ~IOptimizerNew() = default;
+
+    // Takes a join tree and optional hints and returns a join tree.
+    virtual std::shared_ptr<TJoinOptimizerNode> JoinSearch(
+        const std::shared_ptr<TJoinOptimizerNode>& joinTree,
+        const TOptimizerHints& hints = {}
+    ) = 0;
+};
+
+} // namespace NYql

+ 99 - 0
yql/essentials/core/cbo/cbo_optimizer_ut.cpp

@@ -0,0 +1,99 @@
+#include <library/cpp/testing/unittest/registar.h>
+#include <library/cpp/testing/hook/hook.h>
+
+#include <yql/essentials/parser/pg_wrapper/interface/optimizer.h>
+
+using namespace NYql;
+
+Y_UNIT_TEST_SUITE(CboOptimizer) {
+
+// Checks the text produced by IOptimizer::TInput::ToString() for three relations
+// connected by a single equivalence class.
+Y_UNIT_TEST(InputToString) {
+    // Each TRel is initialized as {rows, cost, vars}; see the expected dump below.
+    IOptimizer::TRel rel1 = {100000, 1000000, {{'a'}}};
+    IOptimizer::TRel rel2 = {1000000, 9000009, {{'b'}}};
+    IOptimizer::TRel rel3 = {10000, 9009, {{'c'}}};
+    IOptimizer::TInput input = {{rel1, rel2, rel3}, {}, {}, {}};
+
+    // One equivalence class of three pairs; it is rendered as [a,b,c] below, so the
+    // pairs appear to be 1-based {relation, variable} indices - TODO confirm against IOptimizer::TEq.
+    input.EqClasses.emplace_back(IOptimizer::TEq {
+        {{1, 1}, {2, 1}, {3, 1}}
+    });
+
+    auto str = input.ToString();
+
+    TString expected = R"__(Rels: [{rows: 100000,cost: 1000000,vars: [a]},
+{rows: 1000000,cost: 9000009,vars: [b]},
+{rows: 10000,cost: 9009,vars: [c]}]
+EqClasses: [[a,b,c]]
+)__";
+    UNIT_ASSERT_STRINGS_EQUAL(expected, str);
+}
+
+// Checks the text produced by ToString() on a default-constructed IOptimizer::TOutput:
+// zero rows, zero total cost and an empty braces section.
+Y_UNIT_TEST(OutputToString) {
+    IOptimizer::TOutput output;
+    auto str = output.ToString();
+
+    TString expected = R"__(Rows: 0.00
+TotalCost: 0.00
+{
+}
+)__";
+    UNIT_ASSERT_STRINGS_EQUAL(expected, str);
+}
+
+// Checks IOptimizer::TInput::Normalize() by comparing ToString() output before and after:
+//  1. two equivalence classes sharing a member ([a,b] and [b,c]) end up as one class [a,b,c];
+//  2. two disjoint classes ([a,b] and [d,c]) stay separate, with [d,c] printed as [c,d].
+Y_UNIT_TEST(InputNormalize) {
+    // Each TRel is initialized as {rows, cost, vars}; see the expected dumps below.
+    IOptimizer::TRel rel1 = {100000, 1000000, {{'a'}}};
+    IOptimizer::TRel rel2 = {1000000, 9000009, {{'b'}}};
+    IOptimizer::TRel rel3 = {10000, 9009, {{'c'}}};
+    IOptimizer::TInput input = {{rel1, rel2, rel3}, {}, {}, {}};
+
+    // Scenario 1: the classes overlap on the second relation.
+    input.EqClasses.emplace_back(IOptimizer::TEq {
+        {{1, 1}, {2, 1}}
+    });
+    input.EqClasses.emplace_back(IOptimizer::TEq {
+        {{2, 1}, {3, 1}}
+    });
+
+    // Before normalization both classes are printed as they were added.
+    TString expected = R"__(Rels: [{rows: 100000,cost: 1000000,vars: [a]},
+{rows: 1000000,cost: 9000009,vars: [b]},
+{rows: 10000,cost: 9009,vars: [c]}]
+EqClasses: [[a,b],[b,c]]
+)__";
+    UNIT_ASSERT_STRINGS_EQUAL(expected, input.ToString());
+
+    input.Normalize();
+
+    // After normalization a single merged class is expected.
+    expected = R"__(Rels: [{rows: 100000,cost: 1000000,vars: [a]},
+{rows: 1000000,cost: 9000009,vars: [b]},
+{rows: 10000,cost: 9009,vars: [c]}]
+EqClasses: [[a,b,c]]
+)__";
+    UNIT_ASSERT_STRINGS_EQUAL(expected, input.ToString());
+
+    // Scenario 2: a fourth relation and two classes with no common member.
+    IOptimizer::TRel rel4 = {10001, 9009, {{'d'}}};
+    IOptimizer::TInput input2 = {{rel1, rel2, rel3, rel4}, {}, {}, {}};
+    input2.EqClasses.emplace_back(IOptimizer::TEq {
+        {{1, 1}, {2, 1}}
+    });
+    input2.EqClasses.emplace_back(IOptimizer::TEq {
+        {{4, 1}, {3, 1}}
+    });
+
+    expected = R"__(Rels: [{rows: 100000,cost: 1000000,vars: [a]},
+{rows: 1000000,cost: 9000009,vars: [b]},
+{rows: 10000,cost: 9009,vars: [c]},
+{rows: 10001,cost: 9009,vars: [d]}]
+EqClasses: [[a,b],[d,c]]
+)__";
+    UNIT_ASSERT_STRINGS_EQUAL(expected, input2.ToString());
+
+    input2.Normalize();
+
+    // The classes are not merged; only the member order inside the second class changes.
+    expected = R"__(Rels: [{rows: 100000,cost: 1000000,vars: [a]},
+{rows: 1000000,cost: 9000009,vars: [b]},
+{rows: 10000,cost: 9009,vars: [c]},
+{rows: 10001,cost: 9009,vars: [d]}]
+EqClasses: [[a,b],[c,d]]
+)__";
+    UNIT_ASSERT_STRINGS_EQUAL(expected, input2.ToString());
+}
+
+} // Y_UNIT_TEST_SUITE(CboOptimizer)

+ 15 - 0
yql/essentials/core/cbo/ut/ya.make

@@ -0,0 +1,15 @@
+# Unit-test module for the yql/essentials/core/cbo library.
+UNITTEST_FOR(yql/essentials/core/cbo)
+
+SRCS(
+    cbo_optimizer_ut.cpp
+)
+
+# cbo_optimizer_ut.cpp includes the optimizer interface header from pg_wrapper/interface.
+PEERDIR(
+    yql/essentials/core/cbo
+    yql/essentials/parser/pg_wrapper/interface
+    yql/essentials/public/udf/service/stub
+)
+
+SIZE(SMALL)
+
+END()

+ 15 - 0
yql/essentials/core/cbo/ya.make

@@ -0,0 +1,15 @@
+# Library module built from the cbo sources listed below.
+LIBRARY()
+
+SRCS(
+    cbo_optimizer_new.cpp
+    cbo_hints.cpp
+)
+
+# Generates string conversion code for the enums declared in cbo_optimizer_new.h.
+GENERATE_ENUM_SERIALIZATION(cbo_optimizer_new.h)
+
+END()
+
+# Tests live in the ut subdirectory.
+RECURSE_FOR_TESTS(
+    ut
+)
+

+ 1 - 0
yql/essentials/core/ya.make

@@ -1,4 +1,5 @@
 RECURSE(
+    cbo
     credentials
     file_storage
     issue

+ 388 - 0
yql/essentials/minikql/dom/convert.h

@@ -0,0 +1,388 @@
+#pragma once
+
+#include <yql/essentials/public/udf/udf_value.h>
+#include <yql/essentials/public/udf/udf_value_builder.h>
+#include <yql/essentials/utils/utf8.h>
+
+#include <util/string/escape.h>
+#include <util/string/cast.h>
+#include <util/string/builder.h>
+
+#include <functional>
+
+namespace NYql::NDom {
+
+/**
+ * Converts a DOM node to a boolean.
+ *
+ * Template flags:
+ *  - AutoConvert: coerce non-boolean nodes instead of rejecting them: numbers are
+ *    compared against zero, strings/lists/dicts are true when non-empty, Entity is false.
+ *  - Strict: applies when AutoConvert is off and the node is not convertible: the
+ *    failure is reported through UdfTerminate. Without Strict a default-constructed
+ *    TUnboxedValuePod is returned instead.
+ *
+ * Bool nodes and the string literals "true"/"false" are accepted in every mode.
+ * Attr nodes are unwrapped and the conversion is applied to the wrapped item.
+ *
+ * @param x             DOM node to convert.
+ * @param valueBuilder  used to attach the callee position to the error message.
+ * @param pos           source position reported on failure.
+ */
+template<bool Strict, bool AutoConvert>
+TUnboxedValuePod ConvertToBool(TUnboxedValuePod x, const IValueBuilder* valueBuilder, const TSourcePosition& pos) {
+    switch (GetNodeType(x)) {
+        case ENodeType::Bool:
+            return TUnboxedValuePod(x.Get<bool>());
+        case ENodeType::String:
+            if (const std::string_view str = x.AsStringRef(); str == "true")
+                return TUnboxedValuePod(true);
+            else if (str == "false")
+                return TUnboxedValuePod(false);
+            else if constexpr (AutoConvert)
+                // Any other string is true when it is non-empty.
+                return TUnboxedValuePod(x.AsStringRef().Size() > 0U);
+            else if constexpr (Strict)
+                break;
+            else
+                return {};
+        case ENodeType::Uint64:
+            if constexpr (AutoConvert)
+                return TUnboxedValuePod(x.Get<ui64>() != 0ULL);
+            else if constexpr (Strict)
+                break;
+            else
+                return {};
+        case ENodeType::Int64:
+            if constexpr (AutoConvert)
+                return TUnboxedValuePod(x.Get<i64>() != 0LL);
+            else if constexpr (Strict)
+                break;
+            else
+                return {};
+        case ENodeType::Double:
+            if constexpr (AutoConvert)
+                return TUnboxedValuePod(x.Get<double>() != 0.);
+            else if constexpr (Strict)
+                break;
+            else
+                return {};
+        case ENodeType::Entity:
+            // The original chain tested AutoConvert a second time after Strict;
+            // that branch could never be selected and has been dropped.
+            if constexpr (AutoConvert)
+                return TUnboxedValuePod(false);
+            else if constexpr (Strict)
+                break;
+            else
+                return {};
+        case ENodeType::List:
+            if constexpr (AutoConvert)
+                return TUnboxedValuePod(x.IsBoxed() && x.HasListItems());
+            else if constexpr (Strict)
+                break;
+            else
+                return {};
+        case ENodeType::Dict:
+            if constexpr (AutoConvert)
+                return TUnboxedValuePod(x.IsBoxed() && x.HasDictItems());
+            else if constexpr (Strict)
+                break;
+            else
+                return {};
+        case ENodeType::Attr:
+            return ConvertToBool<Strict, AutoConvert>(x.GetVariantItem().Release(), valueBuilder, pos);
+    }
+
+    // Reached only through `break`, i.e. in Strict mode for a non-convertible node.
+    UdfTerminate((::TStringBuilder() << valueBuilder->WithCalleePosition(pos) << " Cannot parse boolean value from " << TDebugPrinter(x)).c_str());
+}
+
+/**
+ * Tells whether the value `v` of the source integer type TSrc can be represented
+ * in the destination integer type TDst.
+ *
+ * TSrc must be at least as wide as TDst (enforced at compile time).
+ * When both types are the same the answer is always true.
+ */
+template<typename TDst, typename TSrc>
+constexpr inline bool InBounds(const TSrc v) {
+    static_assert(sizeof(TSrc) >= sizeof(TDst), "Expects wide to short.");
+    using TDstLimits = std::numeric_limits<TDst>;
+
+    if constexpr (std::is_same<TSrc, TDst>()) {
+        return true;
+    } else if constexpr (std::is_signed<TSrc>()) {
+        if constexpr (sizeof(TSrc) > sizeof(TDst)) {
+            // Wider signed source: both ends of the destination range fit into TSrc.
+            return TSrc(TDstLimits::min()) <= v && v <= TSrc(TDstLimits::max());
+        } else {
+            // Same width, signed source: only the lower bound is checked.
+            return TSrc(TDstLimits::min()) <= v;
+        }
+    } else {
+        // Unsigned source: only the upper bound is checked.
+        return v <= TSrc(TDstLimits::max());
+    }
+}
+
+/**
+ * Converts a DOM node to the integral type TargetType.
+ *
+ * Without AutoConvert only Int64/Uint64 nodes whose value passes InBounds<TargetType>
+ * are accepted. With AutoConvert integers are cast without a range check, Bool becomes
+ * 0/1, Double is cast, String is parsed with a fallback of 0, and Entity/List/Dict
+ * yield zero.
+ *
+ * A rejected node is reported through UdfTerminate when Strict is set; otherwise a
+ * default-constructed TUnboxedValuePod is returned. Attr nodes are unwrapped first.
+ */
+template<bool Strict, bool AutoConvert, typename TargetType>
+TUnboxedValuePod ConvertToIntegral(TUnboxedValuePod x, const IValueBuilder* valueBuilder, const TSourcePosition& pos) {
+    static_assert(std::is_integral<TargetType>(), "Expect integral.");
+
+    switch (GetNodeType(x)) {
+        case ENodeType::Int64: {
+            const auto signedValue = x.Get<i64>();
+            if constexpr (AutoConvert) {
+                return TUnboxedValuePod(TargetType(signedValue));
+            } else if (InBounds<TargetType>(signedValue)) {
+                return TUnboxedValuePod(TargetType(signedValue));
+            } else if constexpr (Strict) {
+                break;
+            } else {
+                return {};
+            }
+        }
+        case ENodeType::Uint64: {
+            const auto unsignedValue = x.Get<ui64>();
+            if constexpr (AutoConvert) {
+                return TUnboxedValuePod(TargetType(unsignedValue));
+            } else if (InBounds<TargetType>(unsignedValue)) {
+                return TUnboxedValuePod(TargetType(unsignedValue));
+            } else if constexpr (Strict) {
+                break;
+            } else {
+                return {};
+            }
+        }
+        case ENodeType::Bool:
+            if constexpr (AutoConvert) {
+                return TUnboxedValuePod(TargetType(x.Get<bool>() ? 1 : 0));
+            } else if constexpr (Strict) {
+                break;
+            } else {
+                return {};
+            }
+        case ENodeType::Double:
+            if constexpr (AutoConvert) {
+                return TUnboxedValuePod(TargetType(x.Get<double>()));
+            } else if constexpr (Strict) {
+                break;
+            } else {
+                return {};
+            }
+        case ENodeType::String:
+            if constexpr (AutoConvert) {
+                // Unparsable text falls back to zero.
+                return TUnboxedValuePod(FromStringWithDefault(std::string_view(x.AsStringRef()), TargetType(0)));
+            } else if constexpr (Strict) {
+                break;
+            } else {
+                return {};
+            }
+        // Nodes without a numeric payload share one handling.
+        case ENodeType::Entity:
+        case ENodeType::List:
+        case ENodeType::Dict:
+            if constexpr (AutoConvert) {
+                return TUnboxedValuePod::Zero();
+            } else if constexpr (Strict) {
+                break;
+            } else {
+                return {};
+            }
+        case ENodeType::Attr:
+            return ConvertToIntegral<Strict, AutoConvert, TargetType>(x.GetVariantItem().Release(), valueBuilder, pos);
+    }
+
+    // Reached only through `break`, i.e. in Strict mode for a rejected node.
+    UdfTerminate((::TStringBuilder() << valueBuilder->WithCalleePosition(pos) << " Cannot parse integer value from " << TDebugPrinter(x)).c_str());
+}
+
+/**
+ * Converts a DOM node to the floating point type TargetType.
+ *
+ * Double, Uint64 and Int64 nodes are always cast to TargetType. With AutoConvert
+ * Bool becomes 0/1, String is parsed with a fallback of 0, and Entity/List/Dict
+ * yield 0.
+ *
+ * A rejected node is reported through UdfTerminate when Strict is set; otherwise a
+ * default-constructed TUnboxedValuePod is returned. Attr nodes are unwrapped first.
+ */
+template<bool Strict, bool AutoConvert, typename TargetType>
+TUnboxedValuePod ConvertToFloat(TUnboxedValuePod x, const IValueBuilder* valueBuilder, const TSourcePosition& pos) {
+    static_assert(std::is_floating_point<TargetType>(), "Expect float.");
+
+    switch (GetNodeType(x)) {
+        // Numeric nodes convert unconditionally.
+        case ENodeType::Double:
+            return TUnboxedValuePod(TargetType(x.Get<double>()));
+        case ENodeType::Uint64:
+            return TUnboxedValuePod(TargetType(x.Get<ui64>()));
+        case ENodeType::Int64:
+            return TUnboxedValuePod(TargetType(x.Get<i64>()));
+        case ENodeType::Bool:
+            if constexpr (AutoConvert) {
+                return TUnboxedValuePod(x.Get<bool>() ? TargetType(1) : TargetType(0));
+            } else if constexpr (Strict) {
+                break;
+            } else {
+                return {};
+            }
+        case ENodeType::String:
+            if constexpr (AutoConvert) {
+                // Unparsable text falls back to zero.
+                return TUnboxedValuePod(FromStringWithDefault(std::string_view(x.AsStringRef()), TargetType(0)));
+            } else if constexpr (Strict) {
+                break;
+            } else {
+                return {};
+            }
+        // Nodes without a numeric payload share one handling.
+        case ENodeType::Entity:
+        case ENodeType::List:
+        case ENodeType::Dict:
+            if constexpr (AutoConvert) {
+                return TUnboxedValuePod(TargetType(0));
+            } else if constexpr (Strict) {
+                break;
+            } else {
+                return {};
+            }
+        case ENodeType::Attr:
+            return ConvertToFloat<Strict, AutoConvert, TargetType>(x.GetVariantItem().Release(), valueBuilder, pos);
+    }
+
+    // Reached only through `break`, i.e. in Strict mode for a rejected node.
+    UdfTerminate((::TStringBuilder() << valueBuilder->WithCalleePosition(pos) << " Cannot parse floating point value from " << TDebugPrinter(x)).c_str());
+}
+
+/**
+ * Converts a DOM node to a string value.
+ *
+ * Template flags:
+ *  - Utf8: String nodes must hold valid UTF-8. An invalid one is C-escaped when
+ *    AutoConvert is on and rejected otherwise. Without Utf8 a String node is
+ *    returned unchanged.
+ *  - AutoConvert: numbers are printed, Bool becomes "true"/"false", and
+ *    Entity/List/Dict become the empty string.
+ *  - Strict: a rejected node is reported through UdfTerminate; without Strict a
+ *    default-constructed TUnboxedValuePod is returned.
+ *
+ * Attr nodes are unwrapped and the conversion is applied to the wrapped item.
+ *
+ * @param x             DOM node to convert.
+ * @param valueBuilder  allocates new strings and supplies the callee position for errors.
+ * @param pos           source position reported on failure.
+ */
+template<bool Strict, bool AutoConvert, bool Utf8>
+TUnboxedValuePod ConvertToString(TUnboxedValuePod x, const IValueBuilder* valueBuilder, const TSourcePosition& pos) {
+    switch (GetNodeType(x)) {
+        case ENodeType::String:
+            if constexpr (Utf8) {
+                if (IsUtf8(x.AsStringRef())) {
+                    return x;
+                }
+                // Invalid UTF-8. AutoConvert is a template flag, so it is dispatched at
+                // compile time like in every other branch of this function.
+                if constexpr (AutoConvert) {
+                    return valueBuilder->NewString(EscapeC(TStringBuf(x.AsStringRef()))).Release();
+                } else if constexpr (Strict) {
+                    break;
+                } else {
+                    return {};
+                }
+            } else {
+                return x;
+            }
+        case ENodeType::Uint64:
+            if constexpr (AutoConvert) {
+                return valueBuilder->NewString(ToString(x.Get<ui64>())).Release();
+            } else if constexpr (Strict) {
+                break;
+            } else {
+                return {};
+            }
+        case ENodeType::Int64:
+            if constexpr (AutoConvert) {
+                return valueBuilder->NewString(ToString(x.Get<i64>())).Release();
+            } else if constexpr (Strict) {
+                break;
+            } else {
+                return {};
+            }
+        case ENodeType::Bool:
+            if constexpr (AutoConvert) {
+                return x.Get<bool>() ? TUnboxedValuePod::Embedded("true") : TUnboxedValuePod::Embedded("false");
+            } else if constexpr (Strict) {
+                break;
+            } else {
+                return {};
+            }
+        case ENodeType::Double:
+            if constexpr (AutoConvert) {
+                return valueBuilder->NewString(::FloatToString(x.Get<double>())).Release();
+            } else if constexpr (Strict) {
+                break;
+            } else {
+                return {};
+            }
+        case ENodeType::Entity:
+        case ENodeType::List:
+        case ENodeType::Dict:
+            if constexpr (AutoConvert) {
+                return TUnboxedValuePod::Embedded("");
+            } else if constexpr (Strict) {
+                break;
+            } else {
+                return {};
+            }
+        case ENodeType::Attr:
+            return ConvertToString<Strict, AutoConvert, Utf8>(x.GetVariantItem().Release(), valueBuilder, pos);
+    }
+
+    // Reached only through `break`, i.e. in Strict mode for a rejected node.
+    UdfTerminate((::TStringBuilder() << valueBuilder->WithCalleePosition(pos) << " Cannot parse string value from " << TDebugPrinter(x)).c_str());
+}
+
+/**
+ * Boxed value that wraps another list/dict value and applies a converter to its
+ * items lazily, at the moment they are read through an iterator or Lookup().
+ * Size and emptiness queries, Contains() and IsSortedDict() are forwarded to the
+ * wrapped value unchanged.
+ */
+class TLazyConveter : public TManagedBoxedValue {
+public:
+    // Maps one item of the wrapped value to its converted form.
+    using TConverter = std::function<TUnboxedValuePod(TUnboxedValuePod)>;
+
+    // Takes ownership of both the wrapped value and the converter.
+    TLazyConveter(TUnboxedValue&& original, TConverter&& converter)
+        : Original(std::move(original)), Converter(std::move(converter))
+    {}
+private:
+    // Iterator adapter over an iterator of the wrapped value.
+    //   NoSwap == false: Next() converts the value, NextPair() converts the key.
+    //   NoSwap == true:  Next() passes the value through, NextPair() converts the payload.
+    template <bool NoSwap>
+    class TIterator: public TManagedBoxedValue {
+    public:
+        // Keeps its own copy of the converter.
+        TIterator(TUnboxedValue&& original, const TConverter& converter)
+            : Original(std::move(original)), Converter(converter)
+        {}
+
+    private:
+        bool Skip() final {
+            return Original.Skip();
+        }
+
+        bool Next(TUnboxedValue& value) final {
+            if (Original.Next(value)) {
+                if constexpr (!NoSwap) {
+                    value = Converter(value.Release());
+                }
+                return true;
+            }
+            return false;
+        }
+
+        bool NextPair(TUnboxedValue& key, TUnboxedValue& payload) final {
+            if (Original.NextPair(key, payload)) {
+                if constexpr (NoSwap) {
+                    payload = Converter(payload.Release());
+                } else {
+                    key = Converter(key.Release());
+                }
+                return true;
+            }
+            return false;
+        }
+
+        const TUnboxedValue Original;
+        const TConverter Converter;
+    };
+
+    ui64 GetDictLength() const final {
+        return Original.GetDictLength();
+    }
+
+    ui64 GetListLength() const final {
+        return Original.GetListLength();
+    }
+
+    bool HasFastListLength() const final {
+        return Original.HasFastListLength();
+    }
+
+    bool HasDictItems() const final {
+        return Original.HasDictItems();
+    }
+
+    bool HasListItems() const final {
+        return Original.HasListItems();
+    }
+
+    // List items are converted one by one in Next().
+    TUnboxedValue GetListIterator() const final {
+        return TUnboxedValuePod(new TIterator<false>(Original.GetListIterator(), Converter));
+    }
+
+    // Keys are passed through, payloads are converted in NextPair().
+    TUnboxedValue GetDictIterator() const final {
+        return TUnboxedValuePod(new TIterator<true>(Original.GetDictIterator(), Converter));
+    }
+
+    // Keys are passed through unchanged by Next().
+    TUnboxedValue GetKeysIterator() const final {
+        return TUnboxedValuePod(new TIterator<true>(Original.GetKeysIterator(), Converter));
+    }
+
+    // Payloads are converted in Next(). Marked `final` like the sibling overrides.
+    TUnboxedValue GetPayloadsIterator() const final {
+        return TUnboxedValuePod(new TIterator<false>(Original.GetPayloadsIterator(), Converter));
+    }
+
+    bool Contains(const TUnboxedValuePod& key) const final {
+        return Original.Contains(key);
+    }
+
+    // Looks the key up in the wrapped dict; a found payload is converted and
+    // returned as an optional, a miss yields an empty value.
+    TUnboxedValue Lookup(const TUnboxedValuePod& key) const final {
+        if (auto lookup = Original.Lookup(key)) {
+            return Converter(lookup.Release().GetOptionalValue()).MakeOptional();
+        }
+        return {};
+    }
+
+    bool IsSortedDict() const final {
+        return Original.IsSortedDict();
+    }
+
+private:
+    const TUnboxedValue Original;
+    const TConverter Converter;
+};
+
+}

+ 151 - 0
yql/essentials/minikql/dom/hash.cpp

@@ -0,0 +1,151 @@
+#include "node.h"
+#include "hash.h"
+
+#include <yql/essentials/public/udf/udf_type_ops.h>
+
+namespace NYql::NDom {
+
+using namespace NUdf;
+
+namespace {
+
+// Hashes a DOM list by folding the hashes of its items, in order, into a seed of 0.
+// A non-boxed value hashes to 0.
+THashType HashList(const NUdf::TUnboxedValuePod x) {
+    THashType hash = 0ULL;
+    if (x.IsBoxed()) {
+        if (const auto elements = x.GetElements()) {
+            // Direct element access is available.
+            const auto size = x.GetListLength();
+            // 64-bit index: a narrower counter could wrap around before reaching `size`.
+            for (ui64 i = 0U; i < size; ++i) {
+                hash = CombineHashes(hash, HashDom(elements[i]));
+            }
+        } else {
+            // No element array: walk the list through its iterator.
+            const auto it = x.GetListIterator();
+            for (TUnboxedValue v; it.Next(v); hash = CombineHashes(hash, HashDom(v)))
+                continue;
+        }
+    }
+    return hash;
+}
+
+// Hashes a DOM dict by folding, for every entry in iteration order, the combination
+// of the key's string hash and the payload's DOM hash. A non-boxed value hashes to 0.
+// NOTE(review): the result depends on iteration order unless CombineHashes is
+// order-insensitive - confirm that equal dicts always iterate in the same order.
+THashType HashDict(const NUdf::TUnboxedValuePod x) {
+    THashType result = 0ULL;
+    if (!x.IsBoxed()) {
+        return result;
+    }
+
+    const auto entries = x.GetDictIterator();
+    TUnboxedValue key, payload;
+    while (entries.NextPair(key, payload)) {
+        const auto entryHash = CombineHashes(GetStringHash(key), HashDom(payload));
+        result = CombineHashes(result, entryHash);
+    }
+    return result;
+}
+
+// Compares two DOM lists item by item.
+// Two non-boxed values are equal; a boxed and a non-boxed value are not.
+bool EquateLists(const NUdf::TUnboxedValuePod x, const NUdf::TUnboxedValuePod y) {
+    if (x.IsBoxed() && y.IsBoxed()) {
+        const auto ex = x.GetElements();
+        const auto ey = y.GetElements();
+        if (ex && ey) {
+            // Both sides expose their element arrays: compare lengths, then items.
+            const auto size = x.GetListLength();
+            if (size != y.GetListLength()) {
+                return false;
+            }
+            // 64-bit index: a narrower counter could wrap around before reaching `size`.
+            for (ui64 i = 0U; i < size; ++i) {
+                if (!EquateDoms(ex[i], ey[i]))
+                    return false;
+            }
+        } else {
+            // At least one side has no element array: walk both lists in lockstep.
+            const auto itx = x.GetListIterator();
+            const auto ity = y.GetListIterator();
+            TUnboxedValue vx, vy;
+            while (itx.Next(vx)) {
+                if (!ity.Next(vy))
+                    return false;
+                if (!EquateDoms(vx, vy))
+                    return false;
+            }
+            // x is exhausted; the lists match only if y has no items left either.
+            // Without this check a list compared equal to any longer list it prefixes.
+            if (ity.Next(vy))
+                return false;
+        }
+        return true;
+    }
+    return x.IsBoxed() == y.IsBoxed();
+}
+
+// Compares two DOM dicts.
+// Two non-boxed values are equal; a boxed and a non-boxed value are not.
+// Boxed dicts must have the same length and equal payloads under equal keys.
+bool EquateDicts(const NUdf::TUnboxedValuePod x, const NUdf::TUnboxedValuePod y) {
+    if (x.IsBoxed() && y.IsBoxed()) {
+        const auto size = x.GetDictLength();
+        if (size != y.GetDictLength()) {
+            return false;
+        }
+
+        const auto xr =  static_cast<const TPair*>(x.GetResource());
+        const auto yr =  static_cast<const TPair*>(y.GetResource());
+        // clone dict as attrnode
+        if (xr && yr) {
+            // Both sides expose a pair array: entries are compared position by position.
+            // NOTE(review): this presumes both arrays keep their entries in the same order - confirm.
+            // 64-bit index: a narrower counter could wrap around before reaching `size`.
+            for (ui64 i = 0U; i < size; ++i) {
+                if (!EquateStrings(xr[i].first, yr[i].first))
+                    return false;
+                if (!EquateDoms(xr[i].second, yr[i].second))
+                    return false;
+            }
+        } else {
+            // Generic path: every entry of x must be found in y with an equal payload.
+            const auto it = x.GetDictIterator();
+            for (TUnboxedValue k, v; it.NextPair(k, v);) {
+                auto found = y.Lookup(k);
+                if (!found)
+                    return false;
+                if (!EquateDoms(v, found.GetOptionalValue()))
+                    return false;
+            }
+        }
+        return true;
+    }
+    return x.IsBoxed() == y.IsBoxed();
+}
+
+}
+
+// Computes the hash of a DOM node: the node type is combined with a hash of the
+// node's content, so equal content under different node types hashes differently.
+// An Attr node combines the hash of its dict part with the hash of the wrapped item.
+// NOTE(review): GetVariantItem().Release() hands back the raw pod of a temporary -
+// confirm that no reference is left un-released.
+THashType HashDom(const NUdf::TUnboxedValuePod x) {
+    const auto type = GetNodeType(x);
+    const auto typeTag = THashType(type);
+    switch (type) {
+        case ENodeType::Double:
+            return CombineHashes(typeTag, GetFloatHash<double>(x));
+        case ENodeType::Uint64:
+            return CombineHashes(typeTag, GetIntegerHash<ui64>(x));
+        case ENodeType::Int64:
+            return CombineHashes(typeTag, GetIntegerHash<i64>(x));
+        case ENodeType::Bool:
+            return CombineHashes(typeTag, std::hash<bool>()(x.Get<bool>()));
+        case ENodeType::String:
+            return CombineHashes(typeTag, GetStringHash(x));
+        case ENodeType::Entity:
+            // Entity carries no content; a fixed all-ones value stands in for it.
+            return CombineHashes(typeTag, THashType(~0ULL));
+        case ENodeType::List:
+            return CombineHashes(typeTag, HashList(x));
+        case ENodeType::Dict:
+            return CombineHashes(typeTag, HashDict(x));
+        case ENodeType::Attr:
+            return CombineHashes(typeTag, CombineHashes(HashDict(x), HashDom(x.GetVariantItem().Release())));
+    }
+}
+
+// Compares two DOM nodes for equality. Nodes of different types are never equal;
+// nodes of the same type are compared by content. Two Entity nodes are always equal,
+// and Attr nodes need both an equal dict part and an equal wrapped item.
+bool EquateDoms(const NUdf::TUnboxedValuePod x, const NUdf::TUnboxedValuePod y) {
+    const auto type = GetNodeType(x);
+    if (type != GetNodeType(y)) {
+        return false;
+    }
+
+    switch (type) {
+        case ENodeType::Double:
+            return EquateFloats<double>(x, y);
+        case ENodeType::Uint64:
+            return EquateIntegers<ui64>(x, y);
+        case ENodeType::Int64:
+            return EquateIntegers<i64>(x, y);
+        case ENodeType::Bool:
+            return x.Get<bool>() == y.Get<bool>();
+        case ENodeType::String:
+            return EquateStrings(x, y);
+        case ENodeType::Entity:
+            return true;
+        case ENodeType::List:
+            return EquateLists(x, y);
+        case ENodeType::Dict:
+            return EquateDicts(x, y);
+        case ENodeType::Attr:
+            return EquateDicts(x, y) && EquateDoms(x.GetVariantItem().Release(), y.GetVariantItem().Release());
+    }
+    return false;
+}
+
+}

+ 13 - 0
yql/essentials/minikql/dom/hash.h

@@ -0,0 +1,13 @@
+#pragma once
+
+#include <yql/essentials/public/udf/udf_types.h>
+#include <yql/essentials/public/udf/udf_type_ops.h>
+
+namespace NYql::NDom {
+
+// Returns the hash of a DOM node.
+NUdf::THashType HashDom(const NUdf::TUnboxedValuePod value);
+
+// Returns true when the two DOM nodes are equal.
+bool EquateDoms(const NUdf::TUnboxedValuePod lhs, const NUdf::TUnboxedValuePod rhs);
+
+}
+

Some files were not shown because too many files changed in this diff