cbo_optimizer_new.h 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377
  1. #pragma once
  2. #include <util/generic/vector.h>
  3. #include <util/generic/string.h>
  4. #include <yql/essentials/core/yql_statistics.h>
  5. #include <yql/essentials/core/yql_cost_function.h>
  6. #include <unordered_map>
  7. #include <memory>
  8. #include <map>
  9. #include <sstream>
  10. #include <yql/essentials/core/cbo/cbo_interesting_orderings.h>
  11. namespace NYql {
  12. /**
  13. * OptimizerNodes are the internal representations of operators inside the
  14. * Cost-based optimizer. Currently we only support RelOptimizerNode - a node that
  15. * is an input relation to the equi-join, and JoinOptimizerNode - an inner join
  16. * that connects two sets of relations.
  17. */
  18. enum EOptimizerNodeKind: ui32
  19. {
  20. RelNodeType,
  21. JoinNodeType
  22. };
  23. /**
  24. * BaseOptimizerNode is a base class for the internal optimizer nodes
  25. * It records a pointer to statistics and records the current cost of the
  26. * operator tree, rooted at this node
  27. */
  28. struct IBaseOptimizerNode {
  29. EOptimizerNodeKind Kind;
  30. TOptimizerStatistics Stats;
  31. // for interesting orderings framework
  32. NDq::TOrderingsStateMachine::TLogicalOrderings LogicalOrderings;
  33. //
  34. IBaseOptimizerNode(EOptimizerNodeKind k) : Kind(k) {}
  35. IBaseOptimizerNode(EOptimizerNodeKind k, TOptimizerStatistics s) :
  36. Kind(k), Stats(std::move(s)) {}
  37. virtual TVector<TString> Labels()=0;
  38. virtual void Print(std::stringstream& stream, int ntabs=0)=0;
  39. };
  40. enum EJoinKind: ui32
  41. {
  42. InnerJoin,
  43. LeftJoin,
  44. RightJoin,
  45. OuterJoin,
  46. LeftOnly /* == LeftAntiJoin */,
  47. RightOnly /* == RightAntiJoin */,
  48. LeftSemi,
  49. RightSemi,
  50. Cross,
  51. Exclusion
  52. };
  53. EJoinKind ConvertToJoinKind(const TString& joinString);
  54. TString ConvertToJoinString(const EJoinKind kind);
  55. struct TCardinalityHints {
  56. enum ECardOperation : ui32 {
  57. Add,
  58. Subtract,
  59. Multiply,
  60. Divide,
  61. Replace
  62. };
  63. struct TCardinalityHint {
  64. TVector<TString> JoinLabels;
  65. ECardOperation Operation;
  66. double Value;
  67. TString StringRepr;
  68. bool Applied = false;
  69. double ApplyHint(double originalValue) {
  70. Applied = true;
  71. switch (Operation) {
  72. case Add:
  73. return originalValue + Value;
  74. case Subtract:
  75. return originalValue - Value;
  76. case Multiply:
  77. return originalValue * Value;
  78. case Divide:
  79. return originalValue / Value;
  80. case Replace:
  81. return Value;
  82. }
  83. }
  84. };
  85. TVector<TCardinalityHint> Hints;
  86. void PushBack(TVector<TString> labels, ECardOperation op, double value, TString stringRepr) {
  87. Hints.push_back({.JoinLabels = std::move(labels), .Operation = op, .Value = value, .StringRepr = std::move(stringRepr)});
  88. }
  89. };
  90. struct TJoinAlgoHints {
  91. struct TJoinAlgoHint {
  92. TVector<TString> JoinLabels;
  93. EJoinAlgoType Algo;
  94. TString StringRepr;
  95. bool Applied = false;
  96. };
  97. TVector<TJoinAlgoHint> Hints;
  98. void PushBack(TVector<TString> labels, EJoinAlgoType algo, TString stringRepr) {
  99. Hints.push_back({.JoinLabels = std::move(labels), .Algo = algo, .StringRepr = std::move(stringRepr)});
  100. }
  101. };
  102. struct TJoinOrderHints {
  103. struct ITreeNode {
  104. enum _ : ui32 {
  105. Relation,
  106. Join
  107. };
  108. virtual TVector<TString> Labels() = 0;
  109. bool IsRelation() { return Type == Relation; }
  110. bool IsJoin() { return Type == Join; }
  111. virtual ~ITreeNode() = default;
  112. ui32 Type;
  113. };
  114. struct TJoinNode: public ITreeNode {
  115. TJoinNode(std::shared_ptr<ITreeNode> lhs, std::shared_ptr<ITreeNode> rhs)
  116. : Lhs(std::move(lhs))
  117. , Rhs(std::move(rhs))
  118. {
  119. this->Type = ITreeNode::Join;
  120. }
  121. TVector<TString> Labels() override {
  122. auto labels = Lhs->Labels();
  123. auto rhsLabels = Rhs->Labels();
  124. labels.insert(labels.end(), std::make_move_iterator(rhsLabels.begin()), std::make_move_iterator(rhsLabels.end()));
  125. return labels;
  126. }
  127. std::shared_ptr<ITreeNode> Lhs;
  128. std::shared_ptr<ITreeNode> Rhs;
  129. };
  130. struct TRelationNode: public ITreeNode {
  131. TRelationNode(TString label)
  132. : Label(std::move(label))
  133. {
  134. this->Type = ITreeNode::Relation;
  135. }
  136. TVector<TString> Labels() override { return {Label}; }
  137. TString Label;
  138. };
  139. struct TJoinOrderHint {
  140. std::shared_ptr<ITreeNode> Tree;
  141. TString StringRepr;
  142. bool Applied = false;
  143. };
  144. TVector<TJoinOrderHint> Hints;
  145. void PushBack(std::shared_ptr<ITreeNode> hintTree, TString stringRepr) {
  146. Hints.push_back({.Tree = std::move(hintTree), .StringRepr = std::move(stringRepr)});
  147. }
  148. };
  149. struct TOptimizerHints {
  150. std::shared_ptr<TCardinalityHints> CardinalityHints = std::make_shared<TCardinalityHints>();
  151. std::shared_ptr<TJoinAlgoHints> JoinAlgoHints = std::make_shared<TJoinAlgoHints>();
  152. std::shared_ptr<TJoinOrderHints> JoinOrderHints = std::make_shared<TJoinOrderHints>();
  153. TVector<TString> GetUnappliedString();
  154. /*
  155. * The function accepts string with three type of expressions: array of (JoinAlgo | Card | JoinOrder):
  156. * 1) JoinAlgo(t1 t2 ... tn Map | Grace | Lookup) to change join algo for join, where these labels take part
  157. * 2) Card(t1 t2 ... tn (*|/|+|-) Number) to change cardinality for join, where these labels take part or labels only
  158. * 3) JoinOrder( (t1 t2) (t3 (t4 ...)) ) - fixate this join subtree in the general join tree
  159. */
  160. static TOptimizerHints Parse(const TString&);
  161. };
  162. /**
  163. * This is a temporary structure for KQP provider
  164. * We will soon be supporting multiple providers and we will need to design
  165. * some interfaces to pass provider-specific context to the optimizer
  166. */
  167. struct IProviderContext {
  168. virtual ~IProviderContext() = default;
  169. virtual double ComputeJoinCost(
  170. const TOptimizerStatistics& leftStats,
  171. const TOptimizerStatistics& rightStats,
  172. const double outputRows,
  173. const double outputByteSize,
  174. EJoinAlgoType joinAlgo
  175. ) const = 0;
  176. virtual TOptimizerStatistics ComputeJoinStats(
  177. const TOptimizerStatistics& leftStats,
  178. const TOptimizerStatistics& rightStats,
  179. const TVector<NDq::TJoinColumn>& leftJoinKeys,
  180. const TVector<NDq::TJoinColumn>& rightJoinKeys,
  181. EJoinAlgoType joinAlgo,
  182. EJoinKind joinKind,
  183. TCardinalityHints::TCardinalityHint* maybeHint = nullptr
  184. ) const = 0;
  185. virtual TOptimizerStatistics ComputeJoinStatsV1(
  186. const TOptimizerStatistics& leftStats,
  187. const TOptimizerStatistics& rightStats,
  188. const TVector<NDq::TJoinColumn>& leftJoinKeys,
  189. const TVector<NDq::TJoinColumn>& rightJoinKeys,
  190. EJoinAlgoType joinAlgo,
  191. EJoinKind joinKind,
  192. TCardinalityHints::TCardinalityHint* maybeHint,
  193. bool shuffleLeftSide,
  194. bool shuffleRightSide
  195. ) const = 0;
  196. virtual bool IsJoinApplicable(const std::shared_ptr<IBaseOptimizerNode>& left,
  197. const std::shared_ptr<IBaseOptimizerNode>& right,
  198. const TVector<NDq::TJoinColumn>& leftJoinKeys,
  199. const TVector<NDq::TJoinColumn>& rightJoinKeys,
  200. EJoinAlgoType joinAlgo,
  201. EJoinKind joinKind
  202. ) = 0;
  203. };
  204. /**
  205. * Default provider context with default cost and stats computation.
  206. */
  207. struct TBaseProviderContext : public IProviderContext {
  208. TBaseProviderContext() {}
  209. double ComputeJoinCost(
  210. const TOptimizerStatistics& leftStats,
  211. const TOptimizerStatistics& rightStats,
  212. const double outputRows,
  213. const double outputByteSize,
  214. EJoinAlgoType joinAlgo
  215. ) const override;
  216. bool IsJoinApplicable(
  217. const std::shared_ptr<IBaseOptimizerNode>& leftStats,
  218. const std::shared_ptr<IBaseOptimizerNode>& rightStats,
  219. const TVector<NDq::TJoinColumn>& leftJoinKeys,
  220. const TVector<NDq::TJoinColumn>& rightJoinKeys,
  221. EJoinAlgoType joinAlgo,
  222. EJoinKind joinKind
  223. ) override;
  224. TOptimizerStatistics ComputeJoinStats(
  225. const TOptimizerStatistics& leftStats,
  226. const TOptimizerStatistics& rightStats,
  227. const TVector<NDq::TJoinColumn>& leftJoinKeys,
  228. const TVector<NDq::TJoinColumn>& rightJoinKeys,
  229. EJoinAlgoType joinAlgo,
  230. EJoinKind joinKind,
  231. TCardinalityHints::TCardinalityHint* maybeHint = nullptr
  232. ) const override;
  233. TOptimizerStatistics ComputeJoinStatsV1(
  234. const TOptimizerStatistics& leftStats,
  235. const TOptimizerStatistics& rightStats,
  236. const TVector<NDq::TJoinColumn>& leftJoinKeys,
  237. const TVector<NDq::TJoinColumn>& rightJoinKeys,
  238. EJoinAlgoType joinAlgo,
  239. EJoinKind joinKind,
  240. TCardinalityHints::TCardinalityHint* maybeHint,
  241. bool shuffleLeftSide,
  242. bool shuffleRightSide
  243. ) const override;
  244. static const TBaseProviderContext& Instance();
  245. };
  246. /**
  247. * RelOptimizerNode adds a label to base class
  248. * This is the label assinged to the input by equi-Join
  249. */
  250. struct TRelOptimizerNode : public IBaseOptimizerNode {
  251. TString Label;
  252. TRelOptimizerNode(TString label, TOptimizerStatistics stats) :
  253. IBaseOptimizerNode(RelNodeType, std::move(stats)), Label(label) { }
  254. virtual ~TRelOptimizerNode() {}
  255. virtual TVector<TString> Labels();
  256. virtual void Print(std::stringstream& stream, int ntabs=0);
  257. };
  258. /**
  259. * JoinOptimizerNode records the left and right arguments of the join
  260. * as well as the set of join conditions.
  261. * It also has methods to compute the statistics and cost of a join,
  262. * based on pre-computed costs and statistics of the children.
  263. */
  264. struct TJoinOptimizerNode : public IBaseOptimizerNode {
  265. std::shared_ptr<IBaseOptimizerNode> LeftArg;
  266. std::shared_ptr<IBaseOptimizerNode> RightArg;
  267. TVector<NDq::TJoinColumn> LeftJoinKeys;
  268. TVector<NDq::TJoinColumn> RightJoinKeys;
  269. EJoinKind JoinType;
  270. EJoinAlgoType JoinAlgo;
  271. /////////////////// 'ANY' flag means leaving only one row from the join side.
  272. bool LeftAny;
  273. bool RightAny;
  274. ///////////////////
  275. bool IsReorderable;
  276. TJoinOptimizerNode(const std::shared_ptr<IBaseOptimizerNode>& left,
  277. const std::shared_ptr<IBaseOptimizerNode>& right,
  278. TVector<NDq::TJoinColumn> leftKeys,
  279. TVector<NDq::TJoinColumn> rightKeys,
  280. const EJoinKind joinType,
  281. const EJoinAlgoType joinAlgo,
  282. bool leftAny,
  283. bool rightAny,
  284. bool nonReorderable = false
  285. );
  286. virtual ~TJoinOptimizerNode() {}
  287. virtual TVector<TString> Labels();
  288. virtual void Print(std::stringstream& stream, int ntabs=0);
  289. };
  290. struct IOptimizerNew {
  291. using TPtr = std::shared_ptr<IOptimizerNew>;
  292. IProviderContext& Pctx;
  293. IOptimizerNew(IProviderContext& ctx) : Pctx(ctx) {}
  294. virtual ~IOptimizerNew() = default;
  295. virtual std::shared_ptr<TJoinOptimizerNode> JoinSearch(
  296. const std::shared_ptr<TJoinOptimizerNode>& joinTree,
  297. const TOptimizerHints& hints = {}
  298. ) = 0;
  299. };
  300. struct TExprContext;
  301. class IOptimizerFactory : private TNonCopyable {
  302. public:
  303. using TPtr = std::shared_ptr<IOptimizerFactory>;
  304. using TLogger = std::function<void(const TString&)>;
  305. virtual ~IOptimizerFactory() = default;
  306. struct TNativeSettings {
  307. ui32 MaxDPhypDPTableSize = 100000;
  308. };
  309. virtual IOptimizerNew::TPtr MakeJoinCostBasedOptimizerNative(IProviderContext& pctx, TExprContext& ctx, const TNativeSettings& settings) const = 0;
  310. struct TPGSettings {
  311. TLogger Logger = [](const TString&){};
  312. };
  313. virtual IOptimizerNew::TPtr MakeJoinCostBasedOptimizerPG(IProviderContext& pctx, TExprContext& ctx, const TPGSettings& settings) const = 0;
  314. };
  315. } // namespace NYql