Просмотр исходного кода

remove blob columns not present in current scheme

Semyon 6 месяцев назад
Родитель
Commit
65e10f59b3

+ 0 - 1
.github/config/muted_ya.txt

@@ -17,7 +17,6 @@ ydb/core/kqp/ut/query KqpLimits.QueryReplySize
 ydb/core/kqp/ut/query KqpQuery.QueryTimeout
 ydb/core/kqp/ut/query KqpQuery.QueryTimeout
 ydb/core/kqp/ut/scan KqpRequestContext.TraceIdInErrorMessage
 ydb/core/kqp/ut/scan KqpRequestContext.TraceIdInErrorMessage
 ydb/core/kqp/ut/scheme [*/*]*
 ydb/core/kqp/ut/scheme [*/*]*
-ydb/core/kqp/ut/scheme KqpOlapScheme.DropThenAddColumn
 ydb/core/kqp/ut/scheme KqpOlapScheme.TenThousandColumns
 ydb/core/kqp/ut/scheme KqpOlapScheme.TenThousandColumns
 ydb/core/kqp/ut/scheme KqpScheme.AlterAsyncReplication
 ydb/core/kqp/ut/scheme KqpScheme.AlterAsyncReplication
 ydb/core/kqp/ut/scheme KqpScheme.QueryWithAlter
 ydb/core/kqp/ut/scheme KqpScheme.QueryWithAlter

+ 6 - 0
ydb/core/formats/arrow/common/container.cpp

@@ -1,5 +1,6 @@
 #include "container.h"
 #include "container.h"
 
 
+#include <ydb/core/formats/arrow/common/vector_operations.h>
 #include <ydb/core/formats/arrow/accessor/plain/accessor.h>
 #include <ydb/core/formats/arrow/accessor/plain/accessor.h>
 #include <ydb/core/formats/arrow/arrow_helpers.h>
 #include <ydb/core/formats/arrow/arrow_helpers.h>
 #include <ydb/core/formats/arrow/simple_arrays_cache.h>
 #include <ydb/core/formats/arrow/simple_arrays_cache.h>
@@ -59,6 +60,11 @@ TConclusionStatus TGeneralContainer::AddField(const std::shared_ptr<arrow::Field
     return AddField(f, std::make_shared<NAccessor::TTrivialArray>(data));
     return AddField(f, std::make_shared<NAccessor::TTrivialArray>(data));
 }
 }
 
 
+// Removes the fields at positions `idxs` from this container: first from the
+// schema, then the entries of Columns at the same positions, so the two stay aligned.
+void TGeneralContainer::DeleteFieldsByIndex(const std::vector<ui32>& idxs) {
+    Schema->DeleteFieldsByIndex(idxs);
+    NUtil::EraseItems(Columns, idxs);
+}
+
 void TGeneralContainer::Initialize() {
 void TGeneralContainer::Initialize() {
     std::optional<ui64> recordsCount;
     std::optional<ui64> recordsCount;
     AFL_VERIFY(Schema->num_fields() == (i32)Columns.size())("schema", Schema->num_fields())("columns", Columns.size());
     AFL_VERIFY(Schema->num_fields() == (i32)Columns.size())("schema", Schema->num_fields())("columns", Columns.size());

+ 2 - 0
ydb/core/formats/arrow/common/container.h

@@ -74,6 +74,8 @@ public:
 
 
     [[nodiscard]] TConclusionStatus AddField(const std::shared_ptr<arrow::Field>& f, const std::shared_ptr<arrow::ChunkedArray>& data);
     [[nodiscard]] TConclusionStatus AddField(const std::shared_ptr<arrow::Field>& f, const std::shared_ptr<arrow::ChunkedArray>& data);
 
 
+    void DeleteFieldsByIndex(const std::vector<ui32>& idxs);
+
     TGeneralContainer(const std::shared_ptr<arrow::Table>& table);
     TGeneralContainer(const std::shared_ptr<arrow::Table>& table);
     TGeneralContainer(const std::shared_ptr<arrow::RecordBatch>& table);
     TGeneralContainer(const std::shared_ptr<arrow::RecordBatch>& table);
     TGeneralContainer(const std::shared_ptr<arrow::Schema>& schema, std::vector<std::shared_ptr<NAccessor::IChunkedArray>>&& columns);
     TGeneralContainer(const std::shared_ptr<arrow::Schema>& schema, std::vector<std::shared_ptr<NAccessor::IChunkedArray>>&& columns);

+ 54 - 0
ydb/core/formats/arrow/common/vector_operations.h

@@ -0,0 +1,54 @@
+#pragma once
+
+#include <ydb/library/actors/core/log.h>
+
+#include <util/system/types.h>
+
+#include <iterator>
+#include <utility>
+#include <vector>
+
+namespace NKikimr::NArrow::NUtil {
+
+// Default policy for EraseItems: both hooks do nothing, so items are erased
+// and the remaining ones compacted without any extra bookkeeping.
+template <typename T>
+class TDefaultErasePolicy {
+public:
+    // Hook invoked by EraseItems for every item that is being removed.
+    void OnEraseItem(const T&) const {
+    }
+
+    // Hook invoked by EraseItems after a kept item has been placed at its new index.
+    void OnMoveItem(const T&, const ui64) const {
+    }
+};
+
+// Removes the items at positions `idxsToErase` from `container` in one pass,
+// keeping the relative order of the remaining items.
+//
+// container   - vector to compact in place.
+// idxsToErase - positions to remove; must be strictly increasing and every
+//               position must be < container.size() (enforced with AFL_VERIFY).
+// policy      - receives OnEraseItem(item) for each removed item (before anything
+//               is moved over it) and OnMoveItem(item, newIndex) for each kept
+//               item that changes position.
+//
+// Items located before the first erased position are never touched.
+template <typename T, typename ErasePolicy = TDefaultErasePolicy<T>>
+void EraseItems(std::vector<T>& container, const std::vector<ui32>& idxsToErase, const ErasePolicy& policy = TDefaultErasePolicy<T>()) {
+    if (idxsToErase.empty()) {
+        return;
+    }
+    AFL_VERIFY(idxsToErase.front() < container.size());
+
+    auto itNextEraseIdx = idxsToErase.begin();
+    // Everything before the first erased position already sits at its final place.
+    ui64 writeIdx = idxsToErase.front();
+    ui64 readIdx = idxsToErase.front();
+    while (readIdx != container.size()) {
+        // Each outer iteration starts exactly on a position that has to be erased.
+        AFL_VERIFY(itNextEraseIdx != idxsToErase.end() && readIdx == *itNextEraseIdx);
+
+        policy.OnEraseItem(container[readIdx]);
+        ++readIdx;
+        ++itNextEraseIdx;
+        if (itNextEraseIdx != idxsToErase.end()) {
+            AFL_VERIFY(*itNextEraseIdx > *std::prev(itNextEraseIdx));
+            AFL_VERIFY(*itNextEraseIdx < container.size());
+        }
+
+        // Shift the run of kept items that lies before the next erased position (or the end).
+        const ui64 nextReadIdx = itNextEraseIdx == idxsToErase.end() ? container.size() : *itNextEraseIdx;
+        while (readIdx != nextReadIdx) {
+            // writeIdx < readIdx always holds here, so this is never a self-move. The value
+            // overwritten at writeIdx is either an erased item or an already moved-from slot.
+            container[writeIdx] = std::move(container[readIdx]);
+            policy.OnMoveItem(container[writeIdx], writeIdx);
+            ++writeIdx;
+            ++readIdx;
+        }
+    }
+
+    // Drop the tail with erase(): unlike resize(), it does not require T to be default-constructible.
+    container.erase(container.begin() + writeIdx, container.end());
+    AFL_VERIFY(itNextEraseIdx == idxsToErase.end());
+}
+
+}   // namespace NKikimr::NArrow::NUtil

+ 7 - 0
ydb/core/formats/arrow/modifier/schema.cpp

@@ -1,5 +1,6 @@
 #include "schema.h"
 #include "schema.h"
 #include <util/string/builder.h>
 #include <util/string/builder.h>
+#include <ydb/core/formats/arrow/common/vector_operations.h>
 #include <ydb/library/actors/core/log.h>
 #include <ydb/library/actors/core/log.h>
 
 
 namespace NKikimr::NArrow::NModifier {
 namespace NKikimr::NArrow::NModifier {
@@ -29,6 +30,12 @@ TConclusionStatus TSchema::AddField(const std::shared_ptr<arrow::Field>& f) {
     return TConclusionStatus::Success();
     return TConclusionStatus::Success();
 }
 }
 
 
+// Removes the fields at positions `idxs` from Fields; TFieldsErasePolicy updates
+// IndexByName for erased and shifted fields. Requires an initialized schema that
+// has not been finished yet (both conditions are verified).
+void TSchema::DeleteFieldsByIndex(const std::vector<ui32>& idxs) {
+    AFL_VERIFY(Initialized);
+    AFL_VERIFY(!Finished);
+    const TFieldsErasePolicy indexSync(this);
+    NUtil::EraseItems(Fields, idxs, indexSync);
+}
+
 TString TSchema::ToString() const {
 TString TSchema::ToString() const {
     TStringBuilder result;
     TStringBuilder result;
     for (auto&& i : Fields) {
     for (auto&& i : Fields) {

+ 23 - 0
ydb/core/formats/arrow/modifier/schema.h

@@ -1,4 +1,5 @@
 #pragma once
 #pragma once
+#include <ydb/library/actors/core/log.h>
 #include <ydb/library/conclusion/status.h>
 #include <ydb/library/conclusion/status.h>
 #include <contrib/libs/apache/arrow/cpp/src/arrow/type.h>
 #include <contrib/libs/apache/arrow/cpp/src/arrow/type.h>
 #include <util/generic/hash.h>
 #include <util/generic/hash.h>
@@ -39,6 +40,7 @@ public:
     std::shared_ptr<arrow::Schema> Finish();
     std::shared_ptr<arrow::Schema> Finish();
     [[nodiscard]] TConclusionStatus AddField(const std::shared_ptr<arrow::Field>& f);
     [[nodiscard]] TConclusionStatus AddField(const std::shared_ptr<arrow::Field>& f);
     const std::shared_ptr<arrow::Field>& GetFieldByName(const std::string& name) const;
     const std::shared_ptr<arrow::Field>& GetFieldByName(const std::string& name) const;
+    void DeleteFieldsByIndex(const std::vector<ui32>& idxs);
 
 
     bool HasField(const std::string& name) const {
     bool HasField(const std::string& name) const {
         return IndexByName.contains(name);
         return IndexByName.contains(name);
@@ -51,5 +53,26 @@ public:
     const std::shared_ptr<arrow::Field>& GetFieldVerified(const ui32 index) const;
     const std::shared_ptr<arrow::Field>& GetFieldVerified(const ui32 index) const;
 
 
     const std::shared_ptr<arrow::Field>& field(const ui32 index) const;
     const std::shared_ptr<arrow::Field>& field(const ui32 index) const;
+
+private:
+    // Erase policy passed to NUtil::EraseItems when fields are deleted: keeps
+    // Owner->IndexByName consistent while fields are removed and the remaining ones shift.
+    class TFieldsErasePolicy {
+    private:
+        TSchema* const Owner;
+
+    public:
+        // explicit: a TSchema* should never convert to a policy implicitly.
+        explicit TFieldsErasePolicy(TSchema* const owner)
+            : Owner(owner) {
+        }
+
+        // Drops the name -> index entry of a field that is being erased.
+        void OnEraseItem(const std::shared_ptr<arrow::Field>& item) const {
+            Owner->IndexByName.erase(item->name());
+        }
+
+        // Stores the new position of a field that was moved; the field must already be indexed.
+        void OnMoveItem(const std::shared_ptr<arrow::Field>& item, const ui64 new_index) const {
+            auto* findField = Owner->IndexByName.FindPtr(item->name());
+            AFL_VERIFY(findField);
+            *findField = new_index;
+        }
+    };
 };
 };
 }
 }

+ 34 - 12
ydb/core/kqp/ut/scheme/kqp_scheme_ut.cpp

@@ -7849,7 +7849,11 @@ Y_UNIT_TEST_SUITE(KqpOlapScheme) {
         testHelper.ReadData("SELECT * FROM `/Root/ColumnTableTest` WHERE id=1", "[[1;#;[\"test_res_1\"]]]");
         testHelper.ReadData("SELECT * FROM `/Root/ColumnTableTest` WHERE id=1", "[[1;#;[\"test_res_1\"]]]");
     }
     }
 
 
-    Y_UNIT_TEST(DropThenAddColumn) {
+    void TestDropThenAddColumn(bool enableIndexation, bool enableCompaction) {
+        if (enableCompaction) {
+            Y_ABORT_UNLESS(enableIndexation);
+        }
+
         auto csController = NYDBTest::TControllers::RegisterCSControllerGuard<NOlap::TWaitCompactionController>();
         auto csController = NYDBTest::TControllers::RegisterCSControllerGuard<NOlap::TWaitCompactionController>();
         csController->DisableBackground(NYDBTest::ICSController::EBackground::Indexation);
         csController->DisableBackground(NYDBTest::ICSController::EBackground::Indexation);
         csController->DisableBackground(NYDBTest::ICSController::EBackground::Compaction);
         csController->DisableBackground(NYDBTest::ICSController::EBackground::Compaction);
@@ -7874,12 +7878,14 @@ Y_UNIT_TEST_SUITE(KqpOlapScheme) {
             testHelper.BulkUpsert(testTable, tableInserter);
             testHelper.BulkUpsert(testTable, tableInserter);
         }
         }
 
 
-        csController->EnableBackground(NYDBTest::ICSController::EBackground::Indexation);
-        csController->EnableBackground(NYDBTest::ICSController::EBackground::Compaction);
-        csController->WaitIndexation(TDuration::Seconds(5));
-        csController->WaitCompactions(TDuration::Seconds(5));
-        csController->DisableBackground(NYDBTest::ICSController::EBackground::Indexation);
-        csController->DisableBackground(NYDBTest::ICSController::EBackground::Compaction);
+        if (enableCompaction) {
+            csController->EnableBackground(NYDBTest::ICSController::EBackground::Indexation);
+            csController->EnableBackground(NYDBTest::ICSController::EBackground::Compaction);
+            csController->WaitIndexation(TDuration::Seconds(5));
+            csController->WaitCompactions(TDuration::Seconds(5));
+            csController->DisableBackground(NYDBTest::ICSController::EBackground::Indexation);
+            csController->DisableBackground(NYDBTest::ICSController::EBackground::Compaction);
+        }
 
 
         {
         {
             auto alterQuery = TStringBuilder() << "ALTER TABLE `" << testTable.GetName() << "` DROP COLUMN value;";
             auto alterQuery = TStringBuilder() << "ALTER TABLE `" << testTable.GetName() << "` DROP COLUMN value;";
@@ -7900,12 +7906,28 @@ Y_UNIT_TEST_SUITE(KqpOlapScheme) {
             testHelper.BulkUpsert(testTable, tableInserter);
             testHelper.BulkUpsert(testTable, tableInserter);
         }
         }
 
 
-        csController->EnableBackground(NYDBTest::ICSController::EBackground::Indexation);
-        csController->EnableBackground(NYDBTest::ICSController::EBackground::Compaction);
-        csController->WaitIndexation(TDuration::Seconds(5));
-        csController->WaitCompactions(TDuration::Seconds(5));
+        if (enableIndexation) {
+            csController->EnableBackground(NYDBTest::ICSController::EBackground::Indexation);
+            csController->WaitIndexation(TDuration::Seconds(5));
+        }
+        if (enableCompaction) {
+            csController->EnableBackground(NYDBTest::ICSController::EBackground::Compaction);
+            csController->WaitCompactions(TDuration::Seconds(5));
+        }
+
+        testHelper.ReadData("SELECT value FROM `/Root/ColumnTableTest`", "[[#];[#];[[42u]];[[43u]]]");
+    }
+
+    // Baseline case: neither flag of TestDropThenAddColumn is set.
+    Y_UNIT_TEST(DropThenAddColumn) {
+        TestDropThenAddColumn(/*enableIndexation=*/false, /*enableCompaction=*/false);
+    }
+
+    // Indexation-only case: enableCompaction must stay false here, otherwise this test
+    // is an exact duplicate of DropThenAddColumnCompaction and the indexation-only path is never run.
+    Y_UNIT_TEST(DropThenAddColumnIndexation) {
+        TestDropThenAddColumn(/*enableIndexation=*/true, /*enableCompaction=*/false);
+    }
 
 
-        testHelper.ReadData("SELECT * FROM `/Root/ColumnTableTest`", "[[4;#;[\"test_res_1\"]]]");
+    Y_UNIT_TEST(DropThenAddColumnCompaction) {
+        TestDropThenAddColumn(true, true);
     }
     }
 
 
     Y_UNIT_TEST(DropTtlColumn) {
     Y_UNIT_TEST(DropTtlColumn) {

+ 1 - 0
ydb/core/tx/columnshard/engines/changes/indexation.cpp

@@ -235,6 +235,7 @@ TConclusionStatus TInsertColumnEngineChanges::DoConstructBlobs(TConstructionCont
             auto batchSchema =
             auto batchSchema =
                 std::make_shared<arrow::Schema>(inserted.GetMeta().GetSchemaSubset().Apply(blobSchema->GetIndexInfo().ArrowSchema()->fields()));
                 std::make_shared<arrow::Schema>(inserted.GetMeta().GetSchemaSubset().Apply(blobSchema->GetIndexInfo().ArrowSchema()->fields()));
             batch = std::make_shared<NArrow::TGeneralContainer>(NArrow::DeserializeBatch(blobData, batchSchema));
             batch = std::make_shared<NArrow::TGeneralContainer>(NArrow::DeserializeBatch(blobData, batchSchema));
+            blobSchema->AdaptBatchToSchema(*batch, resultSchema);
         }
         }
         IIndexInfo::AddSnapshotColumns(*batch, inserted.GetSnapshot());
         IIndexInfo::AddSnapshotColumns(*batch, inserted.GetSnapshot());
 
 

+ 4 - 1
ydb/core/tx/columnshard/engines/reader/plain_reader/iterator/source.cpp

@@ -215,6 +215,8 @@ bool TCommittedDataSource::DoStartFetchingColumns(
 
 
 void TCommittedDataSource::DoAssembleColumns(const std::shared_ptr<TColumnsSet>& columns) {
 void TCommittedDataSource::DoAssembleColumns(const std::shared_ptr<TColumnsSet>& columns) {
     TMemoryProfileGuard mGuard("SCAN_PROFILE::ASSEMBLER::COMMITTED", IS_DEBUG_LOG_ENABLED(NKikimrServices::TX_COLUMNSHARD_SCAN_MEMORY));
     TMemoryProfileGuard mGuard("SCAN_PROFILE::ASSEMBLER::COMMITTED", IS_DEBUG_LOG_ENABLED(NKikimrServices::TX_COLUMNSHARD_SCAN_MEMORY));
+    const ISnapshotSchema::TPtr batchSchema = GetContext()->GetReadMetadata()->GetIndexVersions().GetSchemaVerified(GetCommitted().GetSchemaVersion());
+    const ISnapshotSchema::TPtr resultSchema = GetContext()->GetReadMetadata()->GetResultSchema();
     if (!GetStageData().GetTable()) {
     if (!GetStageData().GetTable()) {
         AFL_VERIFY(GetStageData().GetBlobs().size() == 1);
         AFL_VERIFY(GetStageData().GetBlobs().size() == 1);
         auto bData = MutableStageData().ExtractBlob(GetStageData().GetBlobs().begin()->first);
         auto bData = MutableStageData().ExtractBlob(GetStageData().GetBlobs().begin()->first);
@@ -222,11 +224,12 @@ void TCommittedDataSource::DoAssembleColumns(const std::shared_ptr<TColumnsSet>&
         auto rBatch = NArrow::DeserializeBatch(bData, std::make_shared<arrow::Schema>(CommittedBlob.GetSchemaSubset().Apply(schema->fields())));
         auto rBatch = NArrow::DeserializeBatch(bData, std::make_shared<arrow::Schema>(CommittedBlob.GetSchemaSubset().Apply(schema->fields())));
         AFL_VERIFY(rBatch)("schema", schema->ToString());
         AFL_VERIFY(rBatch)("schema", schema->ToString());
         auto batch = std::make_shared<NArrow::TGeneralContainer>(rBatch);
         auto batch = std::make_shared<NArrow::TGeneralContainer>(rBatch);
+        batchSchema->AdaptBatchToSchema(*batch, resultSchema);
         GetContext()->GetReadMetadata()->GetIndexInfo().AddSnapshotColumns(*batch, CommittedBlob.GetSnapshotDef(TSnapshot::Zero()));
         GetContext()->GetReadMetadata()->GetIndexInfo().AddSnapshotColumns(*batch, CommittedBlob.GetSnapshotDef(TSnapshot::Zero()));
         GetContext()->GetReadMetadata()->GetIndexInfo().AddDeleteFlagsColumn(*batch, CommittedBlob.GetIsDelete());
         GetContext()->GetReadMetadata()->GetIndexInfo().AddDeleteFlagsColumn(*batch, CommittedBlob.GetIsDelete());
         MutableStageData().AddBatch(batch);
         MutableStageData().AddBatch(batch);
     }
     }
-    MutableStageData().SyncTableColumns(columns->GetSchema()->fields(), *GetContext()->GetReadMetadata()->GetResultSchema());
+    MutableStageData().SyncTableColumns(columns->GetSchema()->fields(), *resultSchema);
 }
 }
 
 
 }   // namespace NKikimr::NOlap::NReader::NPlain
 }   // namespace NKikimr::NOlap::NReader::NPlain

+ 16 - 0
ydb/core/tx/columnshard/engines/scheme/versions/abstract_scheme.cpp

@@ -140,6 +140,22 @@ TConclusion<std::shared_ptr<arrow::RecordBatch>> ISnapshotSchema::PrepareForModi
     }
     }
 }
 }
 
 
+// Removes from `batch` every column that does not exist in `targetSchema` under the
+// same name AND the same column id. `batch` is expected to have been written with this
+// schema. Nothing is done when both schemas have the same version.
+//
+// A column that was dropped and later re-added keeps its name but gets a new id, so
+// comparing ids (not just names) is what filters out stale data of the old column.
+void ISnapshotSchema::AdaptBatchToSchema(NArrow::TGeneralContainer& batch, const ISnapshotSchema::TPtr& targetSchema) const {
+    if (targetSchema->GetVersion() == GetVersion()) {
+        return;
+    }
+    std::vector<ui32> columnIdxToDelete;
+    for (size_t columnIdx = 0; columnIdx < batch.GetSchema()->GetFields().size(); ++columnIdx) {
+        // Resolve both ids by the batch's own field name: columnIdx is a position inside the
+        // batch, which may hold only a subset of this schema's columns, so it must not be
+        // used as an index into this schema's field list.
+        const std::string& columnName = batch.GetSchema()->field(columnIdx)->name();
+        const std::optional<ui32> targetColumnId = targetSchema->GetColumnIdOptional(columnName);
+        const ui32 batchColumnId = GetColumnIdVerified(columnName);
+        if (!targetColumnId || *targetColumnId != batchColumnId) {
+            columnIdxToDelete.emplace_back(columnIdx);
+        }
+    }
+    if (!columnIdxToDelete.empty()) {
+        batch.DeleteFieldsByIndex(columnIdxToDelete);
+    }
+}
+
 ui32 ISnapshotSchema::GetColumnId(const std::string& columnName) const {
 ui32 ISnapshotSchema::GetColumnId(const std::string& columnName) const {
     auto id = GetColumnIdOptional(columnName);
     auto id = GetColumnIdOptional(columnName);
     AFL_VERIFY(id)("column_name", columnName)("schema", JoinSeq(",", GetSchema()->field_names()));
     AFL_VERIFY(id)("column_name", columnName)("schema", JoinSeq(",", GetSchema()->field_names()));

Некоторые файлы не были показаны из-за большого количества измененных файлов