yql_opt_match_recognize.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321
  1. #include "yql_opt_match_recognize.h"
  2. #include "yql_opt_utils.h"
  3. #include <yql/essentials/core/sql_types/time_order_recover.h>
  4. #include <yql/essentials/core/yql_expr_optimize.h>
  5. #include <yql/essentials/utils/log/log.h>
  6. namespace NYql {
  7. using namespace NNodes;
  8. namespace {
  9. bool IsStreaming(const TExprNode::TPtr& input, const TTypeAnnotationContext& typeAnnCtx) {
  10. if (EMatchRecognizeStreamingMode::Disable == typeAnnCtx.MatchRecognizeStreaming){
  11. return false;
  12. }
  13. if (EMatchRecognizeStreamingMode::Force == typeAnnCtx.MatchRecognizeStreaming){
  14. return true;
  15. }
  16. YQL_ENSURE(EMatchRecognizeStreamingMode::Auto == typeAnnCtx.MatchRecognizeStreaming, "Internal logic error");
  17. bool hasPq = false;
  18. NYql::VisitExpr(input, [&hasPq](const TExprNode::TPtr& node){
  19. if (node->IsCallable("DataSource")) {
  20. YQL_ENSURE(node->ChildrenSize() > 0 and node->Child(0)->IsAtom());
  21. hasPq = node->Child(0)->Content() == "pq";
  22. }
  23. return !hasPq;
  24. });
  25. return hasPq;
  26. }
  27. TExprNode::TPtr ExpandMatchRecognizeMeasuresAggregates(const TExprNode::TPtr& node, TExprContext& ctx, TTypeAnnotationContext& /* typeAnnCtx */) {
  28. const auto pos = node->Pos();
  29. const auto vars = node->Child(3);
  30. static constexpr size_t AggregatesLambdasStartPos = 4;
  31. static constexpr size_t MeasuresLambdasStartPos = 3;
  32. return ctx.Builder(pos)
  33. .Callable("MatchRecognizeMeasures")
  34. .Add(0, node->ChildPtr(0))
  35. .Add(1, node->ChildPtr(1))
  36. .Add(2, node->ChildPtr(2))
  37. .Do([&](TExprNodeBuilder& parent) -> TExprNodeBuilder& {
  38. for (size_t i = 0; i < vars->ChildrenSize(); ++i) {
  39. const auto var = vars->Child(i)->Content();
  40. const auto handler = node->ChildPtr(AggregatesLambdasStartPos + i);
  41. if (!var) {
  42. auto value = handler->GetTypeAnn()->GetKind() != ETypeAnnotationKind::Optional
  43. ? ctx.Builder(pos).Callable("Just").Add(0, handler).Seal().Build()
  44. : handler;
  45. parent.Add(
  46. MeasuresLambdasStartPos + i,
  47. ctx.Builder(pos)
  48. .Lambda()
  49. .Param("data")
  50. .Param("vars")
  51. .Add(0, std::move(value))
  52. .Seal()
  53. .Build()
  54. );
  55. continue;
  56. }
  57. parent.Add(
  58. MeasuresLambdasStartPos + i,
  59. ctx.Builder(pos)
  60. .Lambda()
  61. .Param("data")
  62. .Param("vars")
  63. .Callable(0, "Member")
  64. .Callable(0, "Head")
  65. .Callable(0, "Aggregate")
  66. .Callable(0, "OrderedMap")
  67. .Callable(0, "OrderedFlatMap")
  68. .Callable(0, "Member")
  69. .Arg(0, "vars")
  70. .Atom(1, var)
  71. .Seal()
  72. .Lambda(1)
  73. .Param("item")
  74. .Callable(0, "ListFromRange")
  75. .Callable(0, "Member")
  76. .Arg(0, "item")
  77. .Atom(1, "From")
  78. .Seal()
  79. .Callable(1, "+MayWarn")
  80. .Callable(0, "Member")
  81. .Arg(0, "item")
  82. .Atom(1, "To")
  83. .Seal()
  84. .Callable(1, "Uint64")
  85. .Atom(0, "1")
  86. .Seal()
  87. .Seal()
  88. .Seal()
  89. .Seal()
  90. .Seal()
  91. .Lambda(1)
  92. .Param("index")
  93. .Callable(0, "Unwrap")
  94. .Callable(0, "Lookup")
  95. .Callable(0, "ToIndexDict")
  96. .Arg(0, "data")
  97. .Seal()
  98. .Arg(1, "index")
  99. .Seal()
  100. .Seal()
  101. .Seal()
  102. .Seal()
  103. .List(1).Seal()
  104. .List(2)
  105. .Add(0, handler)
  106. .Seal()
  107. .List(3).Seal()
  108. .Seal()
  109. .Seal()
  110. .Atom(1, handler->Child(0)->Content())
  111. .Seal()
  112. .Seal()
  113. .Build()
  114. );
  115. }
  116. return parent;
  117. })
  118. .Seal()
  119. .Build();
  120. }
  121. THashSet<TStringBuf> FindUsedVars(const TExprNode::TPtr& params) {
  122. THashSet<TStringBuf> result;
  123. const auto measures = params->Child(0);
  124. const auto measuresVars = measures->Child(3);
  125. for (const auto& var : measuresVars->Children()) {
  126. result.insert(var->Content());
  127. }
  128. const auto defines = params->Child(4);
  129. static constexpr size_t defineLambdasStartPos = 3;
  130. for (auto i = defineLambdasStartPos; i < defines->ChildrenSize(); ++i) {
  131. const auto lambda = defines->Child(i);
  132. const auto lambdaArgs = lambda->Child(0);
  133. const auto lambdaBody = lambda->ChildPtr(1);
  134. const auto varsArg = lambdaArgs->Child(1);
  135. NYql::VisitExpr(
  136. lambdaBody,
  137. [varsArg, &result](const TExprNode::TPtr& node) {
  138. if (node->IsCallable("Member") && node->Child(0) == varsArg) {
  139. result.insert(node->Child(1)->Content());
  140. return false;
  141. }
  142. return true;
  143. }
  144. );
  145. }
  146. return result;
  147. }
  148. TExprNode::TPtr MarkUnusedPatternVars(const TExprNode::TPtr& node, TExprContext& ctx, const THashSet<TStringBuf>& usedVars, TStringBuf rowsPerMatch) {
  149. const auto pos = node->Pos();
  150. if (node->ChildrenSize() != 0 && node->Child(0)->IsAtom()) {
  151. const auto varName = node->Child(0)->Content();
  152. const auto output = FromString<bool>(node->Child(4)->Content());
  153. const auto varUnused = ("RowsPerMatch_AllRows" != rowsPerMatch || !output) && !usedVars.contains(varName);
  154. return ctx.Builder(pos)
  155. .List()
  156. .Add(0, node->ChildPtr(0))
  157. .Add(1, node->ChildPtr(1))
  158. .Add(2, node->ChildPtr(2))
  159. .Add(3, node->ChildPtr(3))
  160. .Add(4, node->ChildPtr(4))
  161. .Add(5, ctx.NewAtom(pos, ToString(varUnused)))
  162. .Seal()
  163. .Build();
  164. }
  165. TExprNodeList newChildren;
  166. for (size_t chPos = 0; chPos != node->ChildrenSize(); chPos++) {
  167. newChildren.push_back(MarkUnusedPatternVars(node->ChildPtr(chPos), ctx, usedVars, rowsPerMatch));
  168. }
  169. if (node->IsCallable()) {
  170. return ctx.Builder(pos).Callable(node->Content()).Add(std::move(newChildren)).Seal().Build();
  171. } else if (node->IsList()) {
  172. return ctx.Builder(pos).List().Add(std::move(newChildren)).Seal().Build();
  173. } else { // Atom
  174. return node;
  175. }
  176. }
  177. } // anonymous namespace
  178. TExprNode::TPtr ExpandMatchRecognize(const TExprNode::TPtr& node, TExprContext& ctx, TTypeAnnotationContext& typeAnnCtx) {
  179. YQL_ENSURE(node->IsCallable("MatchRecognize"));
  180. const auto input = node->Child(0);
  181. const auto partitionKeySelector = node->Child(1);
  182. const auto partitionColumns = node->Child(2);
  183. const auto sortTraits = node->Child(3);
  184. const auto params = node->Child(4);
  185. const auto pos = node->Pos();
  186. const bool isStreaming = IsStreaming(input, typeAnnCtx);
  187. TExprNode::TPtr settings = AddSetting(*ctx.NewList(pos, {}), pos,
  188. "Streaming", ctx.NewAtom(pos, ToString(isStreaming)), ctx);
  189. const auto rowsPerMatch = params->Child(1)->Content();
  190. auto measures = ExpandMatchRecognizeMeasuresAggregates(params->ChildPtr(0), ctx, typeAnnCtx);
  191. const auto matchRecognize = ctx.Builder(pos)
  192. .Lambda()
  193. .Param("sortedPartition")
  194. .Callable(0, "ForwardList")
  195. .Callable(0, "MatchRecognizeCore")
  196. .Callable(0, "ToFlow")
  197. .Arg(0, "sortedPartition")
  198. .Seal()
  199. .Add(1, partitionKeySelector)
  200. .Add(2, partitionColumns)
  201. .Callable(3, params->Content())
  202. .Add(0, std::move(measures))
  203. .Add(1, params->ChildPtr(1))
  204. .Add(2, params->ChildPtr(2))
  205. .Add(3, MarkUnusedPatternVars(params->ChildPtr(3), ctx, FindUsedVars(params), rowsPerMatch))
  206. .Add(4, params->ChildPtr(4))
  207. .Seal()
  208. .Add(4, settings)
  209. .Seal()
  210. .Seal()
  211. .Seal()
  212. .Build();
  213. TExprNode::TPtr sortKey;
  214. TExprNode::TPtr sortOrder;
  215. ExtractSortKeyAndOrder(pos, sortTraits, sortKey, sortOrder, ctx);
  216. TExprNode::TPtr result;
  217. if (isStreaming) {
  218. YQL_ENSURE(sortOrder->ChildrenSize() == 1, "Expect ORDER BY timestamp for MATCH_RECOGNIZE");
  219. const auto reordered = ctx.Builder(pos)
  220. .Lambda()
  221. .Param("partition")
  222. .Callable("ForwardList")
  223. .Callable(0, "OrderedMap")
  224. .Callable(0, "TimeOrderRecover")
  225. .Callable(0, "ToFlow").
  226. Arg(0, "partition")
  227. .Seal()
  228. .Add(1, sortKey)
  229. .Callable(2, "Interval")
  230. .Add(0, ctx.NewAtom(pos, ToString(typeAnnCtx.TimeOrderRecoverDelay)))
  231. .Seal()
  232. .Callable(3, "Interval")
  233. .Add(0, ctx.NewAtom(pos, ToString(typeAnnCtx.TimeOrderRecoverAhead)))
  234. .Seal()
  235. .Callable(4, "Uint32")
  236. .Add(0, ctx.NewAtom(pos, ToString(typeAnnCtx.TimeOrderRecoverRowLimit)))
  237. .Seal()
  238. .Seal()
  239. .Lambda(1)
  240. .Param("row")
  241. .Callable("RemoveMember")
  242. .Arg(0, "row")
  243. .Add(1, ctx.NewAtom(pos, NYql::NTimeOrderRecover::OUT_OF_ORDER_MARKER))
  244. .Seal()
  245. .Seal()
  246. .Seal()
  247. .Seal()
  248. .Seal()
  249. .Build();
  250. const auto matchRecognizeOnReorderedPartition = ctx.Builder(pos)
  251. .Lambda()
  252. .Param("partition")
  253. .Apply(matchRecognize)
  254. .With(0)
  255. .Apply(reordered)
  256. .With(0)
  257. .Arg("partition")
  258. .Done()
  259. .Seal()
  260. .Done()
  261. .Seal()
  262. .Seal()
  263. .Build();
  264. TExprNode::TPtr keySelector;
  265. if (partitionColumns->ChildrenSize() != 0) {
  266. keySelector = partitionKeySelector;
  267. } else {
  268. //Use pseudo partitioning with constant lambda to wrap TimeOrderRecover into DQ stage
  269. //TODO(zverevgeny): fixme
  270. keySelector = ctx.Builder(pos)
  271. .Lambda()
  272. .Param("row")
  273. .Callable("Bool")
  274. .Add(0, ctx.NewAtom(pos, "true"))
  275. .Seal()
  276. .Seal()
  277. .Build();
  278. }
  279. result = ctx.Builder(pos)
  280. .Callable("ShuffleByKeys")
  281. .Add(0, input)
  282. .Add(1, keySelector)
  283. .Add(2, matchRecognizeOnReorderedPartition)
  284. .Seal()
  285. .Build();
  286. } else { //non-streaming
  287. result = ctx.Builder(pos)
  288. .Callable("PartitionsByKeys")
  289. .Add(0, input)
  290. .Add(1, partitionKeySelector)
  291. .Add(2, sortOrder)
  292. .Add(3, sortKey)
  293. .Add(4, matchRecognize)
  294. .Seal()
  295. .Build();
  296. }
  297. YQL_CLOG(INFO, Core) << "Expanded MatchRecognize";
  298. return result;
  299. }
  300. } //namespace NYql