Browse Source

Fix nulls bitmap size when number of columns is more than 64 (YQL-17952) (#2422)

Andrey Kulaga 1 year ago
parent
commit
bad3a1ee30

+ 1 - 1
ydb/library/yql/minikql/comp_nodes/mkql_grace_join.cpp

@@ -272,7 +272,7 @@ void TGraceJoinPacker::Pack()  {
         case NUdf::EDataSlot::Interval:
             WriteUnaligned<i64>(buffPtr, value.Get<i64>()); break;
         case NUdf::EDataSlot::Date32:
-            WriteUnaligned<i64>(buffPtr, value.Get<i32>()); break;
+            WriteUnaligned<i32>(buffPtr, value.Get<i32>()); break;
         case NUdf::EDataSlot::Datetime64:
             WriteUnaligned<i64>(buffPtr, value.Get<i64>()); break;
         case NUdf::EDataSlot::Timestamp64:

+ 21 - 8
ydb/library/yql/minikql/comp_nodes/mkql_grace_join_imp.cpp

@@ -76,7 +76,7 @@ void TTable::AddTuple(  ui64 * intColumns, char ** stringColumns, ui32 * strings
     }
 
 
-    XXH64_hash_t hash = XXH64(TempTuple.data(), TempTuple.size() * sizeof(ui64), 0);
+    XXH64_hash_t hash = XXH64(TempTuple.data() + NullsBitmapSize_, (TempTuple.size() - NullsBitmapSize_) * sizeof(ui64), 0);
 
     if (!hash) hash = 1;
 
@@ -298,6 +298,8 @@ void TTable::Join( TTable & t1, TTable & t2, EJoinKind joinKind, bool hasMoreLef
         std::swap(JoinTable1, JoinTable2);
     }
 
+    ui64 tuplesFound = 0;
+
     std::vector<ui64, TMKQLAllocator<ui64, EMemorySubPool::Temporary>> joinSlots, spillSlots, slotToIdx;
     std::vector<ui32, TMKQLAllocator<ui32, EMemorySubPool::Temporary>> stringsOffsets1, stringsOffsets2;
     ui64 reservedSize = 6 * (DefaultTupleBytes * DefaultTuplesNum) / sizeof(ui64);
@@ -320,12 +322,19 @@ void TTable::Join( TTable & t1, TTable & t2, EJoinKind joinKind, bool hasMoreLef
         ui64 nullsSize2 = JoinTable2->NullsBitmapSize_;
         ui64 keyIntOffset1 = HashSize + nullsSize1;
         ui64 keyIntOffset2 = HashSize + nullsSize2;
+        bool table1HasKeyStringColumns = (JoinTable1->NumberOfKeyStringColumns != 0);
+        bool table2HasKeyStringColumns = (JoinTable2->NumberOfKeyStringColumns != 0);
+        bool table1HasKeyIColumns = (JoinTable1->NumberOfKeyIColumns != 0);
+        bool table2HasKeyIColumns = (JoinTable2->NumberOfKeyIColumns != 0);
+
 
         if ( bucket2->TuplesNum > bucket1->TuplesNum ) {
             std::swap(bucket1, bucket2);
             std::swap(headerSize1, headerSize2);
             std::swap(nullsSize1, nullsSize2);
             std::swap(keyIntOffset1, keyIntOffset2);
+            std::swap(table1HasKeyStringColumns, table2HasKeyStringColumns);
+            std::swap(table1HasKeyIColumns, table2HasKeyIColumns);
        }
 
         joinResults.reserve(3 * bucket1->TuplesNum );
@@ -334,7 +343,7 @@ void TTable::Join( TTable & t1, TTable & t2, EJoinKind joinKind, bool hasMoreLef
 
         ui64 avgStringsSize = ( 3 * (bucket2->KeyIntVals.size() - bucket2->TuplesNum * headerSize2) ) / ( 2 * bucket2->TuplesNum + 1)  + 1;
 
-        if (JoinTable1->NumberOfKeyStringColumns != 0 || JoinTable1->NumberOfKeyIColumns != 0) {
+        if (table2HasKeyStringColumns || table2HasKeyIColumns ) {
             slotSize = slotSize + avgStringsSize;
         }
 
@@ -351,7 +360,7 @@ void TTable::Join( TTable & t1, TTable & t2, EJoinKind joinKind, bool hasMoreLef
         while (it2 != bucket2->KeyIntVals.end() ) {
 
             ui64 keysValSize;
-            if ( JoinTable2->NumberOfKeyStringColumns > 0 || JoinTable2->NumberOfKeyIColumns > 0) {
+            if ( table2HasKeyStringColumns || table2HasKeyIColumns) {
                 keysValSize = headerSize2 + *(it2 + headerSize2 - 1) ;
             } else {
                 keysValSize = headerSize2;
@@ -396,7 +405,7 @@ void TTable::Join( TTable & t1, TTable & t2, EJoinKind joinKind, bool hasMoreLef
         while ( it1 < bucket1->KeyIntVals.end() ) {
 
             ui64 keysValSize;
-            if ( JoinTable1->NumberOfKeyStringColumns > 0 || JoinTable1->NumberOfKeyIColumns > 0) {
+            if ( table1HasKeyStringColumns || table1HasKeyIColumns ) {
                 keysValSize = headerSize1 + *(it1 + headerSize1 - 1) ;
             } else {
                 keysValSize = headerSize1;
@@ -418,24 +427,26 @@ void TTable::Join( TTable & t1, TTable & t2, EJoinKind joinKind, bool hasMoreLef
             {
 
                 bool matchFound = false;
-                if (((keysValSize - nullsSize1) <= (slotSize - nullsSize2)) && !JoinTable1->NumberOfKeyIColumns ) {
+                if (((keysValSize - nullsSize1) <= (slotSize - nullsSize2)) && !table1HasKeyIColumns ) {
                     if (std::equal(it1 + keyIntOffset1, it1 + keysValSize, slotIt + keyIntOffset2)) {
+                        tuplesFound++;
                         matchFound = true;
                     }
                 }
 
-                if (((keysValSize - nullsSize1) > (slotSize - nullsSize2)) && !JoinTable1->NumberOfKeyIColumns ) {
+                if (((keysValSize - nullsSize1) > (slotSize - nullsSize2)) && !table1HasKeyIColumns) {
                     if (std::equal(it1 + keyIntOffset1, it1 + headerSize1, slotIt + keyIntOffset2)) {
                         ui64 stringsPos = *(slotIt + headerSize2);
                         ui64 stringsSize = *(it1 + headerSize1 - 1);
                         if (std::equal(it1 + headerSize1, it1 + headerSize1 + stringsSize, spillSlots.begin() + stringsPos)) {
+                            tuplesFound++;
                             matchFound = true;
                         }
                     }
                 }
 
  
-                if (JoinTable1->NumberOfKeyIColumns)
+                if (table1HasKeyIColumns)
                 {
                     bool headerMatch = false;
                     bool stringsMatch = false;
@@ -452,7 +463,7 @@ void TTable::Join( TTable & t1, TTable & t2, EJoinKind joinKind, bool hasMoreLef
                         slotStringsStart = spillSlots.begin() + stringsPos;
                     }
 
-                    if ( JoinTable1->NumberOfKeyStringColumns == 0) {
+                    if ( !table1HasKeyStringColumns) {
                         stringsMatch = true;
                     } else {
                         ui64 stringsSize = *(it1 + headerSize1 - 1);
@@ -479,6 +490,7 @@ void TTable::Join( TTable & t1, TTable & t2, EJoinKind joinKind, bool hasMoreLef
                     }
 
                     if (headerMatch && stringsMatch && iValuesMatch) {
+                        tuplesFound++;
                         matchFound = true;
                     }
 
@@ -556,6 +568,7 @@ void TTable::Join( TTable & t1, TTable & t2, EJoinKind joinKind, bool hasMoreLef
     HasMoreLeftTuples_ = hasMoreLeftTuples;
     HasMoreRightTuples_ = hasMoreRightTuples;
 
+    TuplesFound_ += tuplesFound;
     
 }
 

+ 2 - 0
ydb/library/yql/minikql/comp_nodes/mkql_grace_join_imp.h

@@ -169,6 +169,8 @@ class TTable {
 
     bool Table2Initialized_ = false;    // True when iterator counters for second table already initialized
 
+    ui64 TuplesFound_ = 0; // Total number of matching keys found during join
+
 public:
 
     // Adds new tuple to the table.  intColumns, stringColumns - data of columns, 

+ 3 - 3
ydb/library/yql/minikql/comp_nodes/ut/mkql_grace_join_ut.cpp

@@ -1522,12 +1522,12 @@ Y_UNIT_TEST_SUITE(TMiniKQLGraceJoinTest) {
             const auto iterator = graph->GetValue().GetListIterator();
             NUdf::TUnboxedValue tuple;
 
-            UNIT_ASSERT(iterator.Next(tuple));
-            UNBOXED_VALUE_STR_EQUAL(tuple.GetElement(0), "X");
-            UNIT_ASSERT(!tuple.GetElement(1));
             UNIT_ASSERT(iterator.Next(tuple));
             UNBOXED_VALUE_STR_EQUAL(tuple.GetElement(0), "A");
             UNIT_ASSERT_VALUES_EQUAL(tuple.GetElement(1).Get<ui32>(), 1);
+            UNIT_ASSERT(iterator.Next(tuple));
+            UNBOXED_VALUE_STR_EQUAL(tuple.GetElement(0), "X");
+            UNIT_ASSERT(!tuple.GetElement(1));
             UNIT_ASSERT(!iterator.Next(tuple));
             UNIT_ASSERT(!iterator.Next(tuple));
         }

+ 11 - 11
ydb/library/yql/tests/sql/dq_file/part3/canondata/result.json

@@ -956,30 +956,30 @@
     "test.test[join-count_bans--Results]": [],
     "test.test[join-grace_join2--Analyze]": [
         {
-            "checksum": "45db7c8306c9626a640bcb81c9c76780",
-            "size": 4462,
-            "uri": "https://{canondata_backend}/1599023/ee6490b3365cf6b396283cb8bd07f94ceff767b4/resource.tar.gz#test.test_join-grace_join2--Analyze_/plan.txt"
+            "checksum": "759025fd6317614a253eae816ff5941d",
+            "size": 5059,
+            "uri": "https://{canondata_backend}/1923547/c3f064ea25dafaabdc78d527cb888e8c29c155df/resource.tar.gz#test.test_join-grace_join2--Analyze_/plan.txt"
         }
     ],
     "test.test[join-grace_join2--Debug]": [
         {
-            "checksum": "0684948a27f55b655c998444a9060053",
-            "size": 1890,
-            "uri": "https://{canondata_backend}/1599023/ee6490b3365cf6b396283cb8bd07f94ceff767b4/resource.tar.gz#test.test_join-grace_join2--Debug_/opt.yql_patched"
+            "checksum": "34fdff009f1cfcdc53164eeb5db58dd7",
+            "size": 2171,
+            "uri": "https://{canondata_backend}/1923547/c3f064ea25dafaabdc78d527cb888e8c29c155df/resource.tar.gz#test.test_join-grace_join2--Debug_/opt.yql_patched"
         }
     ],
     "test.test[join-grace_join2--Plan]": [
         {
-            "checksum": "45db7c8306c9626a640bcb81c9c76780",
-            "size": 4462,
-            "uri": "https://{canondata_backend}/1599023/ee6490b3365cf6b396283cb8bd07f94ceff767b4/resource.tar.gz#test.test_join-grace_join2--Plan_/plan.txt"
+            "checksum": "759025fd6317614a253eae816ff5941d",
+            "size": 5059,
+            "uri": "https://{canondata_backend}/1923547/c3f064ea25dafaabdc78d527cb888e8c29c155df/resource.tar.gz#test.test_join-grace_join2--Plan_/plan.txt"
         }
     ],
     "test.test[join-grace_join2--Results]": [
         {
-            "checksum": "65a9b307bc9899b17f61962a5d4a49fb",
+            "checksum": "2ad0b4f3207032d285d5f99430e9abaf",
             "size": 5737,
-            "uri": "https://{canondata_backend}/1899731/149477001e0a8762e03fe5262dd2d939b716f0bf/resource.tar.gz#test.test_join-grace_join2--Results_/results.txt"
+            "uri": "https://{canondata_backend}/1923547/c3f064ea25dafaabdc78d527cb888e8c29c155df/resource.tar.gz#test.test_join-grace_join2--Results_/results.txt"
         }
     ],
     "test.test[join-inmem_by_uncomparable_structs--Analyze]": [

+ 9 - 9
ydb/library/yql/tests/sql/sql2yql/canondata/result.json

@@ -7512,9 +7512,9 @@
     ],
     "test_sql2yql.test[join-grace_join2]": [
         {
-            "checksum": "4909542187f7c74060abc053d5707f26",
-            "size": 1627,
-            "uri": "https://{canondata_backend}/1942278/d84f6d9ab025b27e11f463124468076d499ed9b3/resource.tar.gz#test_sql2yql.test_join-grace_join2_/sql.yql"
+            "checksum": "dec15765d9200297261bb22775ec5338",
+            "size": 1782,
+            "uri": "https://{canondata_backend}/1871182/e726c72e47d3c077e5ba351b53dba460544020da/resource.tar.gz#test_sql2yql.test_join-grace_join2_/sql.yql"
         }
     ],
     "test_sql2yql.test[join-group_compact_by]": [
@@ -25243,9 +25243,9 @@
     ],
     "test_sql_format.test[join-grace_join2]": [
         {
-            "checksum": "4946227ff929407fc62f749ef756ef4d",
-            "size": 185,
-            "uri": "https://{canondata_backend}/1880306/64654158d6bfb1289c66c626a8162239289559d0/resource.tar.gz#test_sql_format.test_join-grace_join2_/formatted.sql"
+            "checksum": "7656454a9434ff51ab800908ae346c42",
+            "size": 233,
+            "uri": "https://{canondata_backend}/1871182/e726c72e47d3c077e5ba351b53dba460544020da/resource.tar.gz#test_sql_format.test_join-grace_join2_/formatted.sql"
         }
     ],
     "test_sql_format.test[join-group_compact_by]": [
@@ -26013,9 +26013,9 @@
     ],
     "test_sql_format.test[join-nopushdown_filter_with_depends_on]": [
         {
-            "checksum": "7c0b7c120f321f9b415663ece29a09cd",
-            "size": 247,
-            "uri": "https://{canondata_backend}/1880306/64654158d6bfb1289c66c626a8162239289559d0/resource.tar.gz#test_sql_format.test_join-nopushdown_filter_with_depends_on_/formatted.sql"
+            "checksum": "956eea7d7ef4126950ed02a322c6c492",
+            "size": 272,
+            "uri": "https://{canondata_backend}/212715/1c52a4632d14126361f7585c218d202718c6fa0f/resource.tar.gz#test_sql_format.test_join-nopushdown_filter_with_depends_on_/formatted.sql"
         }
     ],
     "test_sql_format.test[join-opt_on_opt_side]": [

+ 1 - 1
ydb/library/yql/tests/sql/suites/join/grace_join2.sql

@@ -7,4 +7,4 @@ from
 plato.customers1 as c1
 join
 plato.customers1 as c2
-on c1.country_id = c2.country_id;
+on c1.country_id = c2.country_id order by c1.customer_id, c2.customer_id;

+ 1 - 0
ydb/library/yql/tests/sql/suites/join/nopushdown_filter_with_depends_on.sql

@@ -1,4 +1,5 @@
 /* postgres can not */
+/* hybridfile can not  */
 /* custom check: len(yt_res_yson[0]['Write'][0]['Data']) < 4 */
 use plato;