mkql_join_dict.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342
  1. #include "mkql_join_dict.h"
  2. #include <yql/essentials/minikql/computation/mkql_custom_list.h>
  3. #include <yql/essentials/minikql/computation/mkql_computation_node_holders.h>
  4. #include <yql/essentials/minikql/computation/mkql_computation_node_codegen.h> // Y_IGNORE
  5. #include <yql/essentials/minikql/mkql_node_cast.h>
  6. #include <yql/essentials/minikql/mkql_program_builder.h>
  7. namespace NKikimr {
  8. namespace NMiniKQL {
  9. namespace {
  10. template <EJoinKind Kind>
  11. struct TWrapTraits {
  12. static constexpr bool Wrap1 = IsLeftOptional(Kind);
  13. static constexpr bool Wrap2 = IsRightOptional(Kind);
  14. };
  15. template <bool KeyTuple>
  16. class TJoinDictWrapper : public TMutableCodegeneratorPtrNode<TJoinDictWrapper<KeyTuple>> {
  17. typedef TMutableCodegeneratorPtrNode<TJoinDictWrapper<KeyTuple>> TBaseComputation;
  18. public:
  19. TJoinDictWrapper(TComputationMutables& mutables, IComputationNode* dict1, IComputationNode* dict2,
  20. bool isMulti1, bool isMulti2, EJoinKind joinKind, std::vector<ui32>&& indexes = std::vector<ui32>())
  21. : TBaseComputation(mutables, EValueRepresentation::Boxed)
  22. , Dict1(dict1)
  23. , Dict2(dict2)
  24. , IsMulti1(isMulti1)
  25. , IsMulti2(isMulti2)
  26. , JoinKind(joinKind)
  27. , OptIndicies(std::move(indexes))
  28. {
  29. }
  30. NUdf::TUnboxedValuePod DoCalculate(TComputationContext& ctx) const {
  31. const auto& dict1 = Dict1->GetValue(ctx);
  32. const auto& dict2 = Dict2->GetValue(ctx);
  33. return JoinDicts(ctx, dict1, dict2);
  34. }
  35. #ifndef MKQL_DISABLE_CODEGEN
  36. void DoGenerateGetValue(const TCodegenContext& ctx, Value* pointer, BasicBlock*& block) const {
  37. auto& context = ctx.Codegen.GetContext();
  38. const auto joinFunc = ConstantInt::get(Type::getInt64Ty(context), GetMethodPtr(&TJoinDictWrapper::JoinDicts));
  39. const auto joinFuncArg = ConstantInt::get(Type::getInt64Ty(context), (ui64)this);
  40. const auto one = GetNodeValue(Dict1, ctx, block);
  41. const auto two = GetNodeValue(Dict2, ctx, block);
  42. const auto joinFuncType = FunctionType::get(Type::getInt128Ty(context),
  43. { joinFuncArg->getType(), ctx.Ctx->getType(), one->getType(), two->getType() }, false);
  44. const auto joinFuncPtr = CastInst::Create(Instruction::IntToPtr, joinFunc, PointerType::getUnqual(joinFuncType), "cast", block);
  45. const auto join = CallInst::Create(joinFuncType, joinFuncPtr, { joinFuncArg, ctx.Ctx, one, two }, "join", block);
  46. AddRefBoxed(join, ctx, block);
  47. new StoreInst(join, pointer, block);
  48. }
  49. #endif
  50. private:
  51. void RegisterDependencies() const final {
  52. this->DependsOn(Dict1);
  53. this->DependsOn(Dict2);
  54. }
  55. bool HasNullInKey(const NUdf::TUnboxedValue& key) const {
  56. if (!key) {
  57. return true;
  58. }
  59. if constexpr (KeyTuple) {
  60. for (ui32 index : OptIndicies) {
  61. if (!key.GetElement(index)) {
  62. return true;
  63. }
  64. }
  65. }
  66. return false;
  67. }
  68. template <EJoinKind Kind>
  69. void WriteValuesImpl(const NUdf::TUnboxedValuePod& payload1, const NUdf::TUnboxedValuePod& payload2,
  70. TDefaultListRepresentation& resList, TComputationContext& ctx) const {
  71. WriteValues<TWrapTraits<Kind>::Wrap1, TWrapTraits<Kind>::Wrap2>(payload1, payload2, resList, ctx);
  72. }
  73. template <bool WrapAsOptional1, bool WrapAsOptional2>
  74. void WriteValues(const NUdf::TUnboxedValuePod& payload1, const NUdf::TUnboxedValuePod& payload2,
  75. TDefaultListRepresentation& resList, TComputationContext& ctx) const {
  76. const bool isMulti1 = IsMulti1 && bool(payload1);
  77. const bool isMulti2 = IsMulti2 && bool(payload2);
  78. if (!isMulti1 && !isMulti2) {
  79. WriteTuple<WrapAsOptional1, WrapAsOptional2>(payload1, payload2, resList, ctx);
  80. } else if (isMulti1 && !isMulti2) {
  81. const auto it = payload1.GetListIterator();
  82. for (NUdf::TUnboxedValue item1; it.Next(item1);) {
  83. WriteTuple<WrapAsOptional1, WrapAsOptional2>(item1, payload2, resList, ctx);
  84. }
  85. } else if (!isMulti1 && isMulti2) {
  86. const auto it = payload2.GetListIterator();
  87. for (NUdf::TUnboxedValue item2; it.Next(item2);) {
  88. WriteTuple<WrapAsOptional1, WrapAsOptional2>(payload1, item2, resList, ctx);
  89. }
  90. } else {
  91. const auto it1 = payload1.GetListIterator();
  92. for (NUdf::TUnboxedValue item1; it1.Next(item1);) {
  93. const auto it2 = payload2.GetListIterator();
  94. for (NUdf::TUnboxedValue item2; it2.Next(item2);) {
  95. WriteTuple<WrapAsOptional1, WrapAsOptional2>(item1, item2, resList, ctx);
  96. }
  97. }
  98. }
  99. }
  100. template <bool WrapAsOptional1, bool WrapAsOptional2>
  101. void WriteTuple(const NUdf::TUnboxedValuePod& val1, const NUdf::TUnboxedValuePod& val2,
  102. TDefaultListRepresentation& resList, TComputationContext& ctx) const {
  103. NUdf::TUnboxedValue* itemsPtr = nullptr;
  104. auto tuple = ctx.HolderFactory.CreateDirectArrayHolder(2, itemsPtr);
  105. itemsPtr[0] = val1 ? val1.MakeOptionalIf<WrapAsOptional1>() : NUdf::TUnboxedValuePod(val1);
  106. itemsPtr[1] = val2 ? val2.MakeOptionalIf<WrapAsOptional2>() : NUdf::TUnboxedValuePod(val2);
  107. resList = resList.Append(std::move(tuple));
  108. }
  109. NUdf::TUnboxedValuePod JoinDicts(TComputationContext& ctx, const NUdf::TUnboxedValuePod dict1, const NUdf::TUnboxedValuePod dict2) const {
  110. TDefaultListRepresentation resList;
  111. switch (JoinKind) {
  112. case EJoinKind::Inner:
  113. if (dict1.GetDictLength() < dict2.GetDictLength()) {
  114. // traverse dict1, lookup dict2
  115. const auto it = dict1.GetDictIterator();
  116. for (NUdf::TUnboxedValue key1, payload1; it.NextPair(key1, payload1);) {
  117. Y_DEBUG_ABORT_UNLESS(!HasNullInKey(key1));
  118. if (const auto lookup2 = dict2.Lookup(key1)) {
  119. WriteValuesImpl<EJoinKind::Inner>(payload1, lookup2, resList, ctx);
  120. }
  121. }
  122. } else {
  123. // traverse dict2, lookup dict1
  124. const auto it = dict2.GetDictIterator();
  125. for (NUdf::TUnboxedValue key2, payload2; it.NextPair(key2, payload2);) {
  126. Y_DEBUG_ABORT_UNLESS(!HasNullInKey(key2));
  127. if (const auto lookup1 = dict1.Lookup(key2)) {
  128. WriteValuesImpl<EJoinKind::Inner>(lookup1, payload2, resList, ctx);
  129. }
  130. }
  131. }
  132. break;
  133. case EJoinKind::Left: {
  134. // traverse dict1, lookup dict2
  135. const auto it = dict1.GetDictIterator();
  136. for (NUdf::TUnboxedValue key1, payload1; it.NextPair(key1, payload1);) {
  137. auto lookup2 = HasNullInKey(key1) ? NUdf::TUnboxedValue() : dict2.Lookup(key1);
  138. lookup2 = lookup2 ? lookup2.Release().GetOptionalValue() : NUdf::TUnboxedValuePod();
  139. WriteValuesImpl<EJoinKind::Left>(payload1, lookup2, resList, ctx);
  140. }
  141. }
  142. break;
  143. case EJoinKind::Right: {
  144. // traverse dict2, lookup dict1
  145. const auto it = dict2.GetDictIterator();
  146. for (NUdf::TUnboxedValue key2, payload2; it.NextPair(key2, payload2);) {
  147. auto lookup1 = HasNullInKey(key2) ? NUdf::TUnboxedValue() : dict1.Lookup(key2);
  148. lookup1 = lookup1 ? lookup1.Release().GetOptionalValue() : NUdf::TUnboxedValuePod();
  149. WriteValuesImpl<EJoinKind::Right>(lookup1, payload2, resList, ctx);
  150. }
  151. }
  152. break;
  153. case EJoinKind::Full: {
  154. // traverse dict1, lookup dict2 - as Left
  155. const auto it = dict1.GetDictIterator();
  156. for (NUdf::TUnboxedValue key1, payload1; it.NextPair(key1, payload1);) {
  157. auto lookup2 = HasNullInKey(key1) ? NUdf::TUnboxedValue() : dict2.Lookup(key1);
  158. lookup2 = lookup2 ? lookup2.Release().GetOptionalValue() : NUdf::TUnboxedValuePod();
  159. WriteValuesImpl<EJoinKind::Full>(payload1, lookup2, resList, ctx);
  160. }
  161. }
  162. {
  163. // traverse dict2, lookup dict1 - avoid Inner
  164. const auto it = dict2.GetDictIterator();
  165. for (NUdf::TUnboxedValue key2, payload2; it.NextPair(key2, payload2);) {
  166. if (HasNullInKey(key2) || !dict1.Contains(key2)) {
  167. WriteValuesImpl<EJoinKind::Full>(NUdf::TUnboxedValuePod(), payload2, resList, ctx);
  168. }
  169. }
  170. }
  171. break;
  172. case EJoinKind::LeftOnly: {
  173. const auto it = dict1.GetDictIterator();
  174. for (NUdf::TUnboxedValue key1, payload1; it.NextPair(key1, payload1);) {
  175. if (HasNullInKey(key1) || !dict2.Contains(key1)) {
  176. if (IsMulti1) {
  177. TThresher<false>::DoForEachItem(payload1,
  178. [&resList] (NUdf::TUnboxedValue&& item) {
  179. resList = resList.Append(std::move(item));
  180. }
  181. );
  182. } else {
  183. resList = resList.Append(std::move(payload1));
  184. }
  185. }
  186. }
  187. }
  188. break;
  189. case EJoinKind::RightOnly: {
  190. const auto it = dict2.GetDictIterator();
  191. for (NUdf::TUnboxedValue key2, payload2; it.NextPair(key2, payload2);) {
  192. if (HasNullInKey(key2) || !dict1.Contains(key2)) {
  193. if (IsMulti2) {
  194. TThresher<false>::DoForEachItem(payload2,
  195. [&resList] (NUdf::TUnboxedValue&& item) {
  196. resList = resList.Append(std::move(item));
  197. }
  198. );
  199. } else {
  200. resList = resList.Append(std::move(payload2));
  201. }
  202. }
  203. }
  204. }
  205. break;
  206. case EJoinKind::Exclusion: {
  207. // traverse dict1, lookup dict2 - avoid Inner
  208. const auto it = dict1.GetDictIterator();
  209. for (NUdf::TUnboxedValue key1, payload1; it.NextPair(key1, payload1);) {
  210. if (HasNullInKey(key1) || !dict2.Contains(key1)) {
  211. WriteValuesImpl<EJoinKind::Exclusion>(payload1, NUdf::TUnboxedValuePod(), resList, ctx);
  212. }
  213. }
  214. }
  215. {
  216. // traverse dict2, lookup dict1 - avoid Inner
  217. const auto it = dict2.GetDictIterator();
  218. for (NUdf::TUnboxedValue key2, payload2; it.NextPair(key2, payload2);) {
  219. if (HasNullInKey(key2) || !dict1.Contains(key2)) {
  220. WriteValuesImpl<EJoinKind::Exclusion>(NUdf::TUnboxedValuePod(), payload2, resList, ctx);
  221. }
  222. }
  223. }
  224. break;
  225. case EJoinKind::LeftSemi: {
  226. const auto it = dict1.GetDictIterator();
  227. for (NUdf::TUnboxedValue key1, payload1; it.NextPair(key1, payload1);) {
  228. Y_DEBUG_ABORT_UNLESS(!HasNullInKey(key1));
  229. if (dict2.Contains(key1)) {
  230. if (IsMulti1) {
  231. TThresher<false>::DoForEachItem(payload1,
  232. [&resList] (NUdf::TUnboxedValue&& item) {
  233. resList = resList.Append(std::move(item));
  234. }
  235. );
  236. } else {
  237. resList = resList.Append(std::move(payload1));
  238. }
  239. }
  240. }
  241. }
  242. break;
  243. case EJoinKind::RightSemi: {
  244. const auto it = dict2.GetDictIterator();
  245. for (NUdf::TUnboxedValue key2, payload2; it.NextPair(key2, payload2);) {
  246. Y_DEBUG_ABORT_UNLESS(!HasNullInKey(key2));
  247. if (dict1.Contains(key2)) {
  248. if (IsMulti2) {
  249. TThresher<false>::DoForEachItem(payload2,
  250. [&resList] (NUdf::TUnboxedValue&& item) {
  251. resList = resList.Append(std::move(item));
  252. }
  253. );
  254. } else {
  255. resList = resList.Append(std::move(payload2));
  256. }
  257. }
  258. }
  259. }
  260. break;
  261. default:
  262. Y_ABORT("Unknown kind");
  263. }
  264. return ctx.HolderFactory.CreateDirectListHolder(std::move(resList));
  265. }
  266. IComputationNode* const Dict1;
  267. IComputationNode* const Dict2;
  268. const bool IsMulti1;
  269. const bool IsMulti2;
  270. const EJoinKind JoinKind;
  271. const std::vector<ui32> OptIndicies;
  272. };
  273. }
  274. IComputationNode* WrapJoinDict(TCallable& callable, const TComputationNodeFactoryContext& ctx) {
  275. MKQL_ENSURE(callable.GetInputsCount() == 5, "Expected 5 args");
  276. const auto dict1node = callable.GetInput(0);
  277. const auto dict2node = callable.GetInput(1);
  278. const auto isMulti1Node = callable.GetInput(2);
  279. const auto isMulti2Node = callable.GetInput(3);
  280. const auto joinKindNode = callable.GetInput(4);
  281. const auto dict1type = AS_TYPE(TDictType, dict1node);
  282. const auto dict2type = AS_TYPE(TDictType, dict2node);
  283. const auto keyType = dict1type->GetKeyType();
  284. MKQL_ENSURE(keyType->IsSameType(*dict2type->GetKeyType()), "Dict key types must be the same");
  285. const bool multi1 = AS_VALUE(TDataLiteral, isMulti1Node)->AsValue().Get<bool>();
  286. const bool multi2 = AS_VALUE(TDataLiteral, isMulti2Node)->AsValue().Get<bool>();
  287. const ui32 rawKind = AS_VALUE(TDataLiteral, joinKindNode)->AsValue().Get<ui32>();
  288. const auto dict1 = LocateNode(ctx.NodeLocator, callable, 0);
  289. const auto dict2 = LocateNode(ctx.NodeLocator, callable, 1);
  290. if (keyType->IsTuple()) {
  291. const auto tupleType = AS_TYPE(TTupleType, keyType);
  292. std::vector<ui32> indicies;
  293. indicies.reserve(tupleType->GetElementsCount());
  294. for (ui32 i = 0U; i < tupleType->GetElementsCount(); ++i) {
  295. if (tupleType->GetElementType(i)->IsOptional()) {
  296. indicies.emplace_back(i);
  297. }
  298. }
  299. return new TJoinDictWrapper<true>(ctx.Mutables, dict1, dict2, multi1, multi2, GetJoinKind(rawKind), std::move(indicies));
  300. } else {
  301. return new TJoinDictWrapper<false>(ctx.Mutables, dict1, dict2, multi1, multi2, GetJoinKind(rawKind));
  302. }
  303. }
  304. }
  305. }