Просмотр исходного кода

First version of cost based optimizer

Added feature flags and moved debug output to logger

Working version, moved stats to type ann

Updated CBO

Initial commit of CBO
pavelvelikhov 1 год назад
Родитель
Сommit
46501e2a1a

+ 3 - 0
ydb/core/kqp/host/kqp_runner.cpp

@@ -4,6 +4,8 @@
 #include <ydb/core/kqp/query_compiler/kqp_query_compiler.h>
 #include <ydb/core/kqp/opt/kqp_opt.h>
 #include <ydb/core/kqp/opt/logical/kqp_opt_log.h>
+#include <ydb/core/kqp/opt/kqp_statistics_transformer.h>
+
 #include <ydb/core/kqp/opt/physical/kqp_opt_phy.h>
 #include <ydb/core/kqp/opt/peephole/kqp_opt_peephole.h>
 #include <ydb/core/kqp/opt/kqp_query_plan.h>
@@ -89,6 +91,7 @@ public:
             .Add(CreateKqpCheckQueryTransformer(), "CheckKqlQuery")
             .AddPostTypeAnnotation(/* forSubgraph */ true)
             .AddCommonOptimization()
+            .Add(CreateKqpStatisticsTransformer(*typesCtx, Config), "Statistics")
             .Add(CreateKqpLogOptTransformer(OptimizeCtx, *typesCtx, Config), "LogicalOptimize")
             .Add(CreateLogicalDataProposalsInspector(*typesCtx), "ProvidersLogicalOptimize")
             .Add(CreateKqpPhyOptTransformer(OptimizeCtx, *typesCtx), "KqpPhysicalOptimize")

+ 1 - 0
ydb/core/kqp/opt/CMakeLists.darwin-x86_64.txt

@@ -44,6 +44,7 @@ target_sources(core-kqp-opt PRIVATE
   ${CMAKE_SOURCE_DIR}/ydb/core/kqp/opt/kqp_opt_range_legacy.cpp
   ${CMAKE_SOURCE_DIR}/ydb/core/kqp/opt/kqp_query_blocks_transformer.cpp
   ${CMAKE_SOURCE_DIR}/ydb/core/kqp/opt/kqp_query_plan.cpp
+  ${CMAKE_SOURCE_DIR}/ydb/core/kqp/opt/kqp_statistics_transformer.cpp
 )
 generate_enum_serilization(core-kqp-opt
   ${CMAKE_SOURCE_DIR}/ydb/core/kqp/opt/kqp_query_plan.h

+ 1 - 0
ydb/core/kqp/opt/CMakeLists.linux-aarch64.txt

@@ -45,6 +45,7 @@ target_sources(core-kqp-opt PRIVATE
   ${CMAKE_SOURCE_DIR}/ydb/core/kqp/opt/kqp_opt_range_legacy.cpp
   ${CMAKE_SOURCE_DIR}/ydb/core/kqp/opt/kqp_query_blocks_transformer.cpp
   ${CMAKE_SOURCE_DIR}/ydb/core/kqp/opt/kqp_query_plan.cpp
+  ${CMAKE_SOURCE_DIR}/ydb/core/kqp/opt/kqp_statistics_transformer.cpp
 )
 generate_enum_serilization(core-kqp-opt
   ${CMAKE_SOURCE_DIR}/ydb/core/kqp/opt/kqp_query_plan.h

+ 1 - 0
ydb/core/kqp/opt/CMakeLists.linux-x86_64.txt

@@ -45,6 +45,7 @@ target_sources(core-kqp-opt PRIVATE
   ${CMAKE_SOURCE_DIR}/ydb/core/kqp/opt/kqp_opt_range_legacy.cpp
   ${CMAKE_SOURCE_DIR}/ydb/core/kqp/opt/kqp_query_blocks_transformer.cpp
   ${CMAKE_SOURCE_DIR}/ydb/core/kqp/opt/kqp_query_plan.cpp
+  ${CMAKE_SOURCE_DIR}/ydb/core/kqp/opt/kqp_statistics_transformer.cpp
 )
 generate_enum_serilization(core-kqp-opt
   ${CMAKE_SOURCE_DIR}/ydb/core/kqp/opt/kqp_query_plan.h

+ 1 - 0
ydb/core/kqp/opt/CMakeLists.windows-x86_64.txt

@@ -44,6 +44,7 @@ target_sources(core-kqp-opt PRIVATE
   ${CMAKE_SOURCE_DIR}/ydb/core/kqp/opt/kqp_opt_range_legacy.cpp
   ${CMAKE_SOURCE_DIR}/ydb/core/kqp/opt/kqp_query_blocks_transformer.cpp
   ${CMAKE_SOURCE_DIR}/ydb/core/kqp/opt/kqp_query_plan.cpp
+  ${CMAKE_SOURCE_DIR}/ydb/core/kqp/opt/kqp_statistics_transformer.cpp
 )
 generate_enum_serilization(core-kqp-opt
   ${CMAKE_SOURCE_DIR}/ydb/core/kqp/opt/kqp_query_plan.h

+ 157 - 0
ydb/core/kqp/opt/kqp_statistics_transformer.cpp

@@ -0,0 +1,157 @@
+#include "kqp_statistics_transformer.h"
+#include <ydb/library/yql/utils/log/log.h>
+
+
+using namespace NYql;
+using namespace NYql::NNodes;
+using namespace NKikimr::NKqp;
+
+namespace {
+
+/**
+ * Helper method to fetch statistics from type annotation context
+*/
+std::shared_ptr<TOptimizerStatistics> GetStats( const TExprNode* input, TTypeAnnotationContext* typeCtx ) {
+
+    return typeCtx->StatisticsMap.Value(input, std::shared_ptr<TOptimizerStatistics>(nullptr));
+}
+
+/**
+ * Helper method to set statistics in type annotation context
+*/
+void SetStats( const TExprNode* input, TTypeAnnotationContext* typeCtx, std::shared_ptr<TOptimizerStatistics> stats ) {
+
+    typeCtx->StatisticsMap[input] = stats;
+}
+
+/**
+ * Helper method to get cost from type annotation context
+ * Doesn't check if the cost is in the mapping
+*/
+std::optional<double> GetCost( const TExprNode* input, TTypeAnnotationContext* typeCtx ) {
+    return typeCtx->StatisticsMap[input]->Cost;
+}
+
+/**
+ * Helper method to set the cost in type annotation context
+*/
+void SetCost( const TExprNode* input, TTypeAnnotationContext* typeCtx, std::optional<double> cost ) {
+    typeCtx->StatisticsMap[input]->Cost = cost;
+}
+}
+
+/**
+ * For Flatmap we check the input and fetch the statistcs and cost from below
+ * Then we analyze the filter predicate and compute it's selectivity and apply it
+ * to the result.
+*/
+void InferStatisticsForFlatMap(const TExprNode::TPtr& input, TTypeAnnotationContext* typeCtx) {
+
+    auto inputNode = TExprBase(input);
+    auto flatmap = inputNode.Cast<TCoFlatMap>();
+    if (!IsPredicateFlatMap(flatmap.Lambda().Body().Ref())) {
+        return;
+    }
+
+    auto flatmapInput = flatmap.Input();
+    auto inputStats = GetStats(flatmapInput.Raw(), typeCtx);
+
+    if (! inputStats ) {
+        return;
+    }
+
+    // Selectivity is the fraction of tuples that are selected by this predicate
+    // Currently we just set the number to 10% before we have statistics and parse
+    // the predicate
+    double selectivity = 0.1;
+
+    auto outputStats = TOptimizerStatistics(inputStats->Nrows * selectivity, inputStats->Ncols);
+
+    SetStats(input.Get(), typeCtx, std::make_shared<TOptimizerStatistics>(outputStats) );
+    SetCost(input.Get(), typeCtx, GetCost(flatmapInput.Raw(), typeCtx));
+}
+
+/**
+ * Infer statistics and costs for SkipNullMembers
+ * We don't have a good idea at this time how many nulls will be discarded, so we just return the
+ * input statistics.
+*/
+void InferStatisticsForSkipNullMembers(const TExprNode::TPtr& input, TTypeAnnotationContext* typeCtx) {
+
+    auto inputNode = TExprBase(input);
+    auto skipNullMembers = inputNode.Cast<TCoSkipNullMembers>();
+    auto skipNullMembersInput = skipNullMembers.Input();
+
+    auto inputStats = GetStats(skipNullMembersInput.Raw(), typeCtx);
+    if (!inputStats) {
+        return;
+    }
+
+    SetStats( input.Get(), typeCtx, inputStats );
+    SetCost( input.Get(), typeCtx, GetCost( skipNullMembersInput.Raw(), typeCtx ) );
+}
+
+/**
+ * Compute statistics and cost for read table
+ * Currently we just make up a number for the cardinality (100000) and set cost to 0
+*/
+void InferStatisticsForReadTable(const TExprNode::TPtr& input, TTypeAnnotationContext* typeCtx) {
+
+    YQL_CLOG(TRACE, CoreDq) << "Infer statistics for read table";
+
+    auto outputStats = TOptimizerStatistics(100000, 5, 0.0);
+    SetStats( input.Get(), typeCtx, std::make_shared<TOptimizerStatistics>(outputStats) );
+}
+
+/**
+ * Compute sstatistics for index lookup
+ * Currently we just make up a number for cardinality (5) and set cost to 0
+*/
+void InferStatisticsForIndexLookup(const TExprNode::TPtr& input, TTypeAnnotationContext* typeCtx) {
+
+    auto outputStats = TOptimizerStatistics(5, 5, 0.0);
+    SetStats( input.Get(), typeCtx, std::make_shared<TOptimizerStatistics>(outputStats) );
+}
+
+/**
+ * DoTransform method matches operators and callables in the query DAG and
+ * uses pre-computed statistics and costs of the children to compute their cost.
+*/
+IGraphTransformer::TStatus TKqpStatisticsTransformer::DoTransform(TExprNode::TPtr input, 
+    TExprNode::TPtr& output, TExprContext& ctx) {
+
+    output = input;
+    if (!Config->HasOptEnableCostBasedOptimization()) {
+        return IGraphTransformer::TStatus::Ok;
+    }
+          
+    TOptimizeExprSettings settings(nullptr);
+
+    auto ret = OptimizeExpr(input, output, [*this](const TExprNode::TPtr& input, TExprContext& ctx) {
+        Y_UNUSED(ctx);
+        auto output = input;
+
+        if (TCoFlatMap::Match(input.Get())){
+            InferStatisticsForFlatMap(input, typeCtx);
+        }
+        else if(TCoSkipNullMembers::Match(input.Get())){
+            InferStatisticsForSkipNullMembers(input, typeCtx);
+        }
+        else if(TKqlReadTableBase::Match(input.Get()) || TKqlReadTableRangesBase::Match(input.Get())){
+            InferStatisticsForReadTable(input, typeCtx);
+        }
+        else if(TKqlLookupTableBase::Match(input.Get()) || TKqlLookupIndexBase::Match(input.Get())){
+            InferStatisticsForIndexLookup(input, typeCtx);
+        }
+
+        return output;
+    }, ctx, settings);
+
+    return ret;
+}
+
+TAutoPtr<IGraphTransformer> NKikimr::NKqp::CreateKqpStatisticsTransformer(TTypeAnnotationContext& typeCtx, 
+    const TKikimrConfiguration::TPtr& config) {
+
+    return THolder<IGraphTransformer>(new TKqpStatisticsTransformer(typeCtx, config));
+}

+ 45 - 0
ydb/core/kqp/opt/kqp_statistics_transformer.h

@@ -0,0 +1,45 @@
+#pragma once
+
+#include <ydb/library/yql/core/yql_statistics.h>
+
+#include <ydb/core/kqp/common/kqp_yql.h>
+#include <ydb/library/yql/core/yql_graph_transformer.h>
+#include <ydb/library/yql/core/yql_expr_optimize.h>
+#include <ydb/library/yql/core/yql_expr_type_annotation.h>
+#include <ydb/core/kqp/provider/yql_kikimr_provider_impl.h>
+#include <ydb/library/yql/core/yql_opt_utils.h>
+
+namespace NKikimr {
+namespace NKqp {
+
+using namespace NYql;
+using namespace NYql::NNodes;
+
+/***
+ * Statistics transformer is a transformer that propagates statistics and costs from
+ * the leaves of the plan DAG up to the root of the DAG. It handles a number of operators,
+ * but will simply stop propagation if in encounters an operator that it has no rules for.
+ * One of such operators is EquiJoin, but there is a special rule to handle EquiJoin.
+*/
+class TKqpStatisticsTransformer : public TSyncTransformerBase {
+
+    TTypeAnnotationContext* typeCtx;
+    const TKikimrConfiguration::TPtr& Config;
+
+    public:
+        TKqpStatisticsTransformer(TTypeAnnotationContext& typeCtx, const TKikimrConfiguration::TPtr& config) : 
+            typeCtx(&typeCtx), Config(config) {}
+
+        // Main method of the transformer
+        IGraphTransformer::TStatus DoTransform(TExprNode::TPtr input, TExprNode::TPtr& output, TExprContext& ctx) final;
+
+        // Rewind currently does nothing
+        void Rewind() {
+
+    }
+};
+
+TAutoPtr<IGraphTransformer> CreateKqpStatisticsTransformer(TTypeAnnotationContext& typeCtx, 
+    const TKikimrConfiguration::TPtr& config);
+}
+}

+ 7 - 0
ydb/core/kqp/opt/logical/kqp_opt_log.cpp

@@ -33,6 +33,7 @@ public:
         AddHandler(0, &TCoTake::Match, HNDL(RewriteTakeSortToTopSort));
         AddHandler(0, &TCoFlatMap::Match, HNDL(RewriteSqlInToEquiJoin));
         AddHandler(0, &TCoFlatMap::Match, HNDL(RewriteSqlInCompactToJoin));
+        AddHandler(0, &TCoEquiJoin::Match, HNDL(OptimizeEquiJoinWithCosts));
         AddHandler(0, &TCoEquiJoin::Match, HNDL(RewriteEquiJoin));
         AddHandler(0, &TDqJoin::Match, HNDL(JoinToIndexLookup));
         AddHandler(0, &TCoCalcOverWindowBase::Match, HNDL(ExpandWindowFunctions));
@@ -123,6 +124,12 @@ protected:
         return output;
     }
 
+    TMaybeNode<TExprBase> OptimizeEquiJoinWithCosts(TExprBase node, TExprContext& ctx) {
+        TExprBase output = DqOptimizeEquiJoinWithCosts(node, ctx, TypesCtx, Config->HasOptEnableCostBasedOptimization());
+        DumpAppliedRule("OptimizeEquiJoinWithCosts", node.Ptr(), output.Ptr(), ctx);
+        return output;
+    }
+
     TMaybeNode<TExprBase> RewriteEquiJoin(TExprBase node, TExprContext& ctx) {
         TExprBase output = DqRewriteEquiJoin(node, KqpCtx.Config->GetHashJoinMode(), ctx);
         DumpAppliedRule("RewriteEquiJoin", node.Ptr(), output.Ptr(), ctx);

+ 1 - 0
ydb/core/kqp/opt/ya.make

@@ -12,6 +12,7 @@ SRCS(
     kqp_opt_range_legacy.cpp
     kqp_query_blocks_transformer.cpp
     kqp_query_plan.cpp
+    kqp_statistics_transformer.cpp
 )
 
 PEERDIR(

+ 6 - 0
ydb/core/kqp/provider/yql_kikimr_settings.cpp

@@ -63,6 +63,7 @@ TKikimrConfiguration::TKikimrConfiguration() {
     REGISTER_SETTING(*this, OptEnablePredicateExtract);
     REGISTER_SETTING(*this, OptEnableOlapPushdown);
     REGISTER_SETTING(*this, OptUseFinalizeByKey);
+    REGISTER_SETTING(*this, OptEnableCostBasedOptimization);
 
     /* Runtime */
     REGISTER_SETTING(*this, ScanQuery);
@@ -124,6 +125,11 @@ bool TKikimrSettings::HasOptUseFinalizeByKey() const {
     return GetOptionalFlagValue(OptUseFinalizeByKey.Get()) == EOptionalFlag::Enabled;
 }
 
+bool TKikimrSettings::HasOptEnableCostBasedOptimization() const {
+    return GetOptionalFlagValue(OptEnableCostBasedOptimization.Get()) == EOptionalFlag::Enabled;
+}
+
+
 EOptionalFlag TKikimrSettings::GetOptPredicateExtract() const {
     return GetOptionalFlagValue(OptEnablePredicateExtract.Get());
 }

Некоторые файлы не были показаны из-за большого количества измененных файлов