123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308 |
- #include "cbo_optimizer_new.h"
- #include <array>
- #include <util/string/builder.h>
- #include <util/generic/hash.h>
- #include <util/generic/hash_set.h>
- #include <util/string/cast.h>
- #include <util/string/join.h>
- #include <util/string/printf.h>
- const TString& ToString(NYql::EJoinKind);
- const TString& ToString(NYql::EJoinAlgoType);
- namespace NYql {
- using namespace NYql::NDq;
- namespace {
- THashMap<TString,EJoinKind> JoinKindMap = {
- {"Inner",EJoinKind::InnerJoin},
- {"Left",EJoinKind::LeftJoin},
- {"Right",EJoinKind::RightJoin},
- {"Full",EJoinKind::OuterJoin},
- {"LeftOnly",EJoinKind::LeftOnly},
- {"RightOnly",EJoinKind::RightOnly},
- {"Exclusion",EJoinKind::Exclusion},
- {"LeftSemi",EJoinKind::LeftSemi},
- {"RightSemi",EJoinKind::RightSemi},
- {"Cross",EJoinKind::Cross}};
- THashMap<TString,TCardinalityHints::ECardOperation> HintOpMap = {
- {"+",TCardinalityHints::ECardOperation::Add},
- {"-",TCardinalityHints::ECardOperation::Subtract},
- {"*",TCardinalityHints::ECardOperation::Multiply},
- {"/",TCardinalityHints::ECardOperation::Divide},
- {"#",TCardinalityHints::ECardOperation::Replace}};
- }
- EJoinKind ConvertToJoinKind(const TString& joinString) {
- auto maybeKind = JoinKindMap.find(joinString);
- Y_ENSURE(maybeKind != JoinKindMap.end());
- return maybeKind->second;
- }
- TString ConvertToJoinString(const EJoinKind kind) {
- for (auto [k,v] : JoinKindMap) {
- if (v == kind) {
- return k;
- }
- }
- Y_ENSURE(false,"Unknown join kind");
- }
- TVector<TString> TRelOptimizerNode::Labels() {
- TVector<TString> res;
- res.emplace_back(Label);
- return res;
- }
- void TRelOptimizerNode::Print(std::stringstream& stream, int ntabs) {
- for (int i = 0; i < ntabs; i++){
- stream << " ";
- }
- stream << "Rel: " << Label << "\n";
- for (int i = 0; i < ntabs; i++){
- stream << " ";
- }
- stream << Stats << "\n";
- }
- TJoinOptimizerNode::TJoinOptimizerNode(
- const std::shared_ptr<IBaseOptimizerNode>& left,
- const std::shared_ptr<IBaseOptimizerNode>& right,
- TVector<TJoinColumn> leftKeys,
- TVector<TJoinColumn> rightKeys,
- const EJoinKind joinType,
- const EJoinAlgoType joinAlgo,
- bool leftAny,
- bool rightAny,
- bool nonReorderable
- ) : IBaseOptimizerNode(JoinNodeType)
- , LeftArg(left)
- , RightArg(right)
- , LeftJoinKeys(leftKeys)
- , RightJoinKeys(rightKeys)
- , JoinType(joinType)
- , JoinAlgo(joinAlgo)
- , LeftAny(leftAny)
- , RightAny(rightAny)
- , IsReorderable(!nonReorderable)
- {}
- TVector<TString> TJoinOptimizerNode::Labels() {
- auto res = LeftArg->Labels();
- auto rightLabels = RightArg->Labels();
- res.insert(res.begin(),rightLabels.begin(),rightLabels.end());
- return res;
- }
- void TJoinOptimizerNode::Print(std::stringstream& stream, int ntabs) {
- for (int i = 0; i < ntabs; i++){
- stream << " ";
- }
- stream << "Join: (" << ToString(JoinType) << "," << ToString(JoinAlgo);
- if (LeftAny) {
- stream << ",LeftAny";
- }
- if (RightAny) {
- stream << ",RightAny";
- }
- stream << ") ";
- for (size_t i=0; i<LeftJoinKeys.size(); i++){
- stream << LeftJoinKeys[i].RelName << "." << LeftJoinKeys[i].AttributeName
- << "=" << RightJoinKeys[i].RelName << "."
- << RightJoinKeys[i].AttributeName << ",";
- }
- stream << "\n";
- for (int i = 0; i < ntabs; i++){
- stream << " ";
- }
- stream << Stats << "\n";
-
- LeftArg->Print(stream, ntabs+1);
- RightArg->Print(stream, ntabs+1);
- }
- bool IsPKJoin(const TOptimizerStatistics& stats, const TVector<TJoinColumn>& joinKeys) {
- if (!stats.KeyColumns) {
- return false;
- }
- for(size_t i = 0; i < stats.KeyColumns->Data.size(); i++){
- if (std::find_if(joinKeys.begin(), joinKeys.end(),
- [&] (const TJoinColumn& c) { return c.AttributeName == stats.KeyColumns->Data[i];}) == joinKeys.end()) {
- return false;
- }
- }
- return true;
- }
- bool TBaseProviderContext::IsJoinApplicable(const std::shared_ptr<IBaseOptimizerNode>& left,
- const std::shared_ptr<IBaseOptimizerNode>& right,
- const TVector<TJoinColumn>& leftJoinKeys,
- const TVector<TJoinColumn>& rightJoinKeys,
- EJoinAlgoType joinAlgo,
- EJoinKind joinKind) {
- Y_UNUSED(left);
- Y_UNUSED(right);
- Y_UNUSED(leftJoinKeys);
- Y_UNUSED(rightJoinKeys);
- Y_UNUSED(joinKind);
- return joinAlgo == EJoinAlgoType::MapJoin;
- }
- double TBaseProviderContext::ComputeJoinCost(const TOptimizerStatistics& leftStats, const TOptimizerStatistics& rightStats, const double outputRows, const double outputByteSize, EJoinAlgoType joinAlgo) const {
- Y_UNUSED(outputByteSize);
- Y_UNUSED(joinAlgo);
- return leftStats.Nrows + 2.0 * rightStats.Nrows + outputRows;
- }
- /**
- * Compute the cost and output cardinality of a join
- *
- * Currently a very basic computation targeted at GraceJoin
- *
- * The build is on the right side, so we make the build side a bit more expensive than the probe
- */
- TOptimizerStatistics TBaseProviderContext::ComputeJoinStats(
- const TOptimizerStatistics& leftStats,
- const TOptimizerStatistics& rightStats,
- const TVector<TJoinColumn>& leftJoinKeys,
- const TVector<TJoinColumn>& rightJoinKeys,
- EJoinAlgoType joinAlgo,
- EJoinKind joinKind,
- TCardinalityHints::TCardinalityHint* maybeHint) const
- {
- double newCard{};
- EStatisticsType outputType;
- bool leftKeyColumns = false;
- bool rightKeyColumns = false;
- double selectivity = 1.0;
- bool isRightPKJoin = IsPKJoin(rightStats,rightJoinKeys);
- bool isLeftPKJoin = IsPKJoin(leftStats,leftJoinKeys);
- if (isRightPKJoin && isLeftPKJoin) {
- auto rightPKJoinCard = leftStats.Nrows * rightStats.Selectivity;
- auto leftPKJoinCard = rightStats.Nrows * leftStats.Selectivity;
- if (rightPKJoinCard > leftPKJoinCard) {
- isRightPKJoin = false;
- }
- }
- if (isRightPKJoin) {
- switch (joinKind) {
- case EJoinKind::LeftJoin:
- case EJoinKind::LeftOnly:
- newCard = leftStats.Nrows; break;
- default: {
- newCard = leftStats.Nrows * rightStats.Selectivity;
- }
- }
- selectivity = leftStats.Selectivity * rightStats.Selectivity;
- leftKeyColumns = true;
- if (leftStats.Type == EStatisticsType::BaseTable){
- outputType = EStatisticsType::FilteredFactTable;
- } else {
- outputType = leftStats.Type;
- }
- } else if (isLeftPKJoin) {
- switch (joinKind) {
- case EJoinKind::RightJoin:
- case EJoinKind::RightOnly:
- newCard = rightStats.Nrows; break;
- default: {
- newCard = leftStats.Selectivity * rightStats.Nrows;
- }
- }
-
- selectivity = leftStats.Selectivity * rightStats.Selectivity;
- rightKeyColumns = true;
- if (rightStats.Type == EStatisticsType::BaseTable){
- outputType = EStatisticsType::FilteredFactTable;
- } else {
- outputType = rightStats.Type;
- }
- } else {
- std::optional<double> lhsUniqueVals;
- std::optional<double> rhsUniqueVals;
- if (leftStats.ColumnStatistics && rightStats.ColumnStatistics && !leftJoinKeys.empty() && !rightJoinKeys.empty()) {
- auto lhs = leftJoinKeys[0].AttributeName;
- lhsUniqueVals = leftStats.ColumnStatistics->Data[lhs].NumUniqueVals;
- auto rhs = rightJoinKeys[0].AttributeName;
- rightStats.ColumnStatistics->Data[rhs];
- rhsUniqueVals = leftStats.ColumnStatistics->Data[lhs].NumUniqueVals;
- }
- if (lhsUniqueVals.has_value() && rhsUniqueVals.has_value()) {
- newCard = leftStats.Nrows * rightStats.Nrows / std::max(*lhsUniqueVals, *rhsUniqueVals);
- } else {
- newCard = 0.2 * leftStats.Nrows * rightStats.Nrows;
- }
- outputType = EStatisticsType::ManyManyJoin;
- }
- if (maybeHint) {
- newCard = maybeHint->ApplyHint(newCard);
- }
- int newNCols = leftStats.Ncols + rightStats.Ncols;
- double newByteSize = leftStats.Nrows ? (leftStats.ByteSize / leftStats.Nrows) * newCard : 0 +
- rightStats.Nrows ? (rightStats.ByteSize / rightStats.Nrows) * newCard : 0;
- double cost = ComputeJoinCost(leftStats, rightStats, newCard, newByteSize, joinAlgo)
- + leftStats.Cost + rightStats.Cost;
- auto result = TOptimizerStatistics(outputType, newCard, newNCols, newByteSize, cost,
- leftKeyColumns ? leftStats.KeyColumns : ( rightKeyColumns ? rightStats.KeyColumns : TIntrusivePtr<TOptimizerStatistics::TKeyColumns>()));
- result.Selectivity = selectivity;
- return result;
- }
- const TBaseProviderContext& TBaseProviderContext::Instance() {
- static TBaseProviderContext staticContext;
- return staticContext;
- }
- TVector<TString> TOptimizerHints::GetUnappliedString() {
- TVector<TString> res;
- for (const auto& hint: JoinAlgoHints->Hints) {
- if (!hint.Applied) {
- res.push_back(hint.StringRepr);
- }
- }
- for (const auto& hint: JoinOrderHints->Hints) {
- if (!hint.Applied) {
- res.push_back(hint.StringRepr);
- }
- }
- for (const auto& hint: CardinalityHints->Hints) {
- if (!hint.Applied) {
- res.push_back(hint.StringRepr);
- }
- }
- return res;
- }
- } // namespace NYql
|