123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374 |
- #include "mkql_block_trimmer.h"
- #include <library/cpp/testing/unittest/registar.h>
- #include <yql/essentials/public/udf/arrow/block_builder.h>
- #include <yql/essentials/public/udf/arrow/block_reader.h>
- #include <yql/essentials/public/udf/arrow/memory_pool.h>
- #include <yql/essentials/minikql/mkql_type_builder.h>
- #include <yql/essentials/minikql/mkql_function_registry.h>
- #include <yql/essentials/minikql/mkql_program_builder.h>
- #include <yql/essentials/minikql/invoke_builtins/mkql_builtins.h>
- using namespace NYql::NUdf;
- using namespace NKikimr;
- struct TBlockTrimmerTestData {
- TBlockTrimmerTestData()
- : FunctionRegistry(NMiniKQL::CreateFunctionRegistry(NMiniKQL::CreateBuiltinRegistry()))
- , Alloc(__LOCATION__)
- , Env(Alloc)
- , PgmBuilder(Env, *FunctionRegistry)
- , MemInfo("Memory")
- , ArrowPool(GetYqlMemoryPool())
- {
- }
- TIntrusivePtr<NMiniKQL::IFunctionRegistry> FunctionRegistry;
- NMiniKQL::TScopedAlloc Alloc;
- NMiniKQL::TTypeEnvironment Env;
- NMiniKQL::TProgramBuilder PgmBuilder;
- NMiniKQL::TMemoryUsageInfo MemInfo;
- arrow::MemoryPool* const ArrowPool;
- };
- Y_UNIT_TEST_SUITE(TBlockTrimmerTest) {
- Y_UNIT_TEST(TestFixedSize) {
- TBlockTrimmerTestData data;
- const auto int64Type = data.PgmBuilder.NewDataType(NUdf::EDataSlot::Int64, false);
- size_t itemSize = NMiniKQL::CalcMaxBlockItemSize(int64Type);
- size_t blockLen = NMiniKQL::CalcBlockLen(itemSize);
- Y_ENSURE(blockLen > 8);
- constexpr auto testSize = NMiniKQL::MaxBlockSizeInBytes / sizeof(i64);
- constexpr auto sliceSize = 1024;
- static_assert(testSize % sliceSize == 0);
- auto builder = MakeArrayBuilder(NMiniKQL::TTypeInfoHelper(), int64Type, *data.ArrowPool, blockLen, nullptr);
- auto reader = MakeBlockReader(NMiniKQL::TTypeInfoHelper(), int64Type);
- auto trimmer = MakeBlockTrimmer(NMiniKQL::TTypeInfoHelper(), int64Type, data.ArrowPool);
- for (size_t i = 0; i < testSize; i++) {
- builder->Add(TBlockItem(i));
- }
- auto datum = builder->Build(true);
- Y_ENSURE(datum.is_array());
- auto array = datum.array();
- for (size_t sliceIdx = 0; sliceIdx < testSize / sliceSize; sliceIdx++) {
- auto slice = Chop(array, sliceSize);
- auto trimmedSlice = trimmer->Trim(slice);
- for (size_t elemIdx = 0; elemIdx < sliceSize; elemIdx++) {
- TBlockItem lhs = reader->GetItem(*slice, elemIdx);
- TBlockItem rhs = reader->GetItem(*trimmedSlice, elemIdx);
- UNIT_ASSERT_VALUES_EQUAL_C(lhs.Get<i64>(), rhs.Get<i64>(), "Expected the same data after trim");
- }
- }
- }
- Y_UNIT_TEST(TestString) {
- TBlockTrimmerTestData data;
- const auto stringType = data.PgmBuilder.NewDataType(NUdf::EDataSlot::String, false);
- size_t itemSize = NMiniKQL::CalcMaxBlockItemSize(stringType);
- size_t blockLen = NMiniKQL::CalcBlockLen(itemSize);
- Y_ENSURE(blockLen > 8);
- // To fit all strings into single block
- constexpr auto testSize = 512;
- constexpr auto sliceSize = 128;
- static_assert(testSize % sliceSize == 0);
- auto builder = MakeArrayBuilder(NMiniKQL::TTypeInfoHelper(), stringType, *data.ArrowPool, blockLen, nullptr);
- auto reader = MakeBlockReader(NMiniKQL::TTypeInfoHelper(), stringType);
- auto trimmer = MakeBlockTrimmer(NMiniKQL::TTypeInfoHelper(), stringType, data.ArrowPool);
- std::string testString;
- testString.resize(testSize);
- for (size_t i = 0; i < testSize; i++) {
- testString[i] = static_cast<char>(i);
- if (i % 2) {
- builder->Add(TBlockItem(TStringRef(testString.data(), i + 1)));
- } else {
- // Empty string
- builder->Add(TBlockItem(TStringRef()));
- }
- }
- auto datum = builder->Build(true);
- Y_ENSURE(datum.is_array());
- auto array = datum.array();
- for (size_t sliceIdx = 0; sliceIdx < testSize / sliceSize; sliceIdx++) {
- auto slice = Chop(array, sliceSize);
- auto trimmedSlice = trimmer->Trim(slice);
- for (size_t elemIdx = 0; elemIdx < sliceSize; elemIdx++) {
- TBlockItem lhs = reader->GetItem(*slice, elemIdx);
- TBlockItem rhs = reader->GetItem(*trimmedSlice, elemIdx);
- UNIT_ASSERT_VALUES_EQUAL_C(lhs.AsStringRef(), rhs.AsStringRef(), "Expected the same data after trim");
- }
- }
- }
- Y_UNIT_TEST(TestOptional) {
- TBlockTrimmerTestData data;
- const auto optionalInt64Type = data.PgmBuilder.NewDataType(NUdf::EDataSlot::Int64, true);
- size_t itemSize = NMiniKQL::CalcMaxBlockItemSize(optionalInt64Type);
- size_t blockLen = NMiniKQL::CalcBlockLen(itemSize);
- Y_ENSURE(blockLen > 8);
- constexpr auto testSize = NMiniKQL::MaxBlockSizeInBytes / sizeof(i64);
- constexpr auto sliceSize = 1024;
- static_assert(testSize % sliceSize == 0);
- auto builder = MakeArrayBuilder(NMiniKQL::TTypeInfoHelper(), optionalInt64Type, *data.ArrowPool, blockLen, nullptr);
- auto reader = MakeBlockReader(NMiniKQL::TTypeInfoHelper(), optionalInt64Type);
- auto trimmer = MakeBlockTrimmer(NMiniKQL::TTypeInfoHelper(), optionalInt64Type, data.ArrowPool);
- for (size_t i = 0; i < testSize; i++) {
- if (i % 2) {
- builder->Add(TBlockItem());
- } else {
- builder->Add(TBlockItem(i));
- }
- }
- auto datum = builder->Build(true);
- Y_ENSURE(datum.is_array());
- auto array = datum.array();
- for (size_t sliceIdx = 0; sliceIdx < testSize / sliceSize; sliceIdx++) {
- auto slice = Chop(array, sliceSize);
- auto trimmedSlice = trimmer->Trim(slice);
- for (size_t elemIdx = 0; elemIdx < sliceSize; elemIdx++) {
- TBlockItem lhs = reader->GetItem(*slice, elemIdx);
- TBlockItem rhs = reader->GetItem(*trimmedSlice, elemIdx);
- UNIT_ASSERT_VALUES_EQUAL_C(bool(lhs), bool(rhs), "Expected the same optionality after trim");
- if (lhs) {
- UNIT_ASSERT_VALUES_EQUAL_C(lhs.Get<i64>(), rhs.Get<i64>(), "Expected the same data after trim");
- }
- }
- }
- }
- Y_UNIT_TEST(TestExternalOptional) {
- TBlockTrimmerTestData data;
- const auto doubleOptInt64Type = data.PgmBuilder.NewOptionalType(data.PgmBuilder.NewDataType(NUdf::EDataSlot::Int64, true));
- size_t itemSize = NMiniKQL::CalcMaxBlockItemSize(doubleOptInt64Type);
- size_t blockLen = NMiniKQL::CalcBlockLen(itemSize);
- Y_ENSURE(blockLen > 8);
- constexpr auto testSize = NMiniKQL::MaxBlockSizeInBytes / sizeof(i64);
- constexpr auto sliceSize = 1024;
- static_assert(testSize % sliceSize == 0);
- auto builder = MakeArrayBuilder(NMiniKQL::TTypeInfoHelper(), doubleOptInt64Type, *data.ArrowPool, blockLen, nullptr);
- auto reader = MakeBlockReader(NMiniKQL::TTypeInfoHelper(), doubleOptInt64Type);
- auto trimmer = MakeBlockTrimmer(NMiniKQL::TTypeInfoHelper(), doubleOptInt64Type, data.ArrowPool);
- for (size_t i = 0; i < testSize; i++) {
- if (i % 2) {
- builder->Add(TBlockItem(i).MakeOptional());
- } else if (i % 4) {
- builder->Add(TBlockItem());
- } else {
- builder->Add(TBlockItem().MakeOptional());
- }
- }
- auto datum = builder->Build(true);
- Y_ENSURE(datum.is_array());
- auto array = datum.array();
- for (size_t sliceIdx = 0; sliceIdx < testSize / sliceSize; sliceIdx++) {
- auto slice = Chop(array, sliceSize);
- auto trimmedSlice = trimmer->Trim(slice);
- for (size_t elemIdx = 0; elemIdx < sliceSize; elemIdx++) {
- TBlockItem lhs = reader->GetItem(*slice, elemIdx);
- TBlockItem rhs = reader->GetItem(*trimmedSlice, elemIdx);
- for (size_t i = 0; i < 2; i++) {
- UNIT_ASSERT_VALUES_EQUAL_C(bool(lhs), bool(rhs), "Expected the same optionality after trim");
- if (!lhs) {
- break;
- }
- lhs = lhs.GetOptionalValue();
- rhs = rhs.GetOptionalValue();
- }
- if (lhs) {
- UNIT_ASSERT_VALUES_EQUAL_C(lhs.Get<i64>(), rhs.Get<i64>(), "Expected the same data after trim");
- }
- }
- }
- }
- Y_UNIT_TEST(TestTuple) {
- TBlockTrimmerTestData data;
- std::vector<NMiniKQL::TType*> types;
- types.push_back(data.PgmBuilder.NewDataType(NUdf::EDataSlot::Int64));
- types.push_back(data.PgmBuilder.NewDataType(NUdf::EDataSlot::String));
- types.push_back(data.PgmBuilder.NewDataType(NUdf::EDataSlot::Int64, true));
- const auto tupleType = data.PgmBuilder.NewTupleType(types);
- size_t itemSize = NMiniKQL::CalcMaxBlockItemSize(tupleType);
- size_t blockLen = NMiniKQL::CalcBlockLen(itemSize);
- Y_ENSURE(blockLen > 8);
- // To fit all strings into single block
- constexpr auto testSize = 512;
- constexpr auto sliceSize = 128;
- static_assert(testSize % sliceSize == 0);
- auto builder = MakeArrayBuilder(NMiniKQL::TTypeInfoHelper(), tupleType, *data.ArrowPool, blockLen, nullptr);
- auto reader = MakeBlockReader(NMiniKQL::TTypeInfoHelper(), tupleType);
- auto trimmer = MakeBlockTrimmer(NMiniKQL::TTypeInfoHelper(), tupleType, data.ArrowPool);
- std::string testString;
- testString.resize(testSize);
- std::vector<TBlockItem*> testTuples(testSize);
- for (size_t i = 0; i < testSize; i++) {
- testString[i] = static_cast<char>(i);
- TBlockItem* tupleItems = new TBlockItem[3];
- testTuples.push_back(tupleItems);
- tupleItems[0] = TBlockItem(i);
- tupleItems[1] = TBlockItem(TStringRef(testString.data(), i + 1));
- tupleItems[2] = i % 2 ? TBlockItem(i) : TBlockItem();
- builder->Add(TBlockItem(tupleItems));
- }
- auto datum = builder->Build(true);
- Y_ENSURE(datum.is_array());
- auto array = datum.array();
- for (size_t sliceIdx = 0; sliceIdx < testSize / sliceSize; sliceIdx++) {
- auto slice = Chop(array, sliceSize);
- auto trimmedSlice = trimmer->Trim(slice);
- for (size_t elemIdx = 0; elemIdx < sliceSize; elemIdx++) {
- TBlockItem lhs = reader->GetItem(*slice, elemIdx);
- TBlockItem rhs = reader->GetItem(*trimmedSlice, elemIdx);
- UNIT_ASSERT_VALUES_EQUAL_C(lhs.GetElement(0).Get<i64>(), rhs.GetElement(0).Get<i64>(), "Expected the same data after trim");
- UNIT_ASSERT_VALUES_EQUAL_C(lhs.GetElement(1).AsStringRef(), rhs.GetElement(1).AsStringRef(), "Expected the same data after trim");
- UNIT_ASSERT_VALUES_EQUAL_C(bool(lhs.GetElement(2)), bool(rhs.GetElement(2)), "Expected the same optionality after trim");
- if (bool(lhs.GetElement(2))) {
- UNIT_ASSERT_VALUES_EQUAL_C(lhs.GetElement(2).Get<i64>(), rhs.GetElement(2).Get<i64>(), "Expected the same data after trim");
- }
- }
- }
- for (auto tupleItems : testTuples) {
- delete[] tupleItems;
- }
- }
- Y_UNIT_TEST(TestTzDate) {
- TBlockTrimmerTestData data;
- using TDtLayout = TDataType<TTzDatetime>::TLayout;
- const auto tzDatetimeType = data.PgmBuilder.NewDataType(NUdf::EDataSlot::TzDatetime, false);
- size_t itemSize = NMiniKQL::CalcMaxBlockItemSize(tzDatetimeType);
- size_t blockLen = NMiniKQL::CalcBlockLen(itemSize);
- Y_ENSURE(blockLen > 8);
- constexpr auto testSize = NMiniKQL::MaxBlockSizeInBytes / (sizeof(TDtLayout) + sizeof(ui16));
- constexpr auto sliceSize = 1024;
- static_assert(testSize % sliceSize == 0);
- auto builder = MakeArrayBuilder(NMiniKQL::TTypeInfoHelper(), tzDatetimeType, *data.ArrowPool, blockLen, nullptr);
- auto reader = MakeBlockReader(NMiniKQL::TTypeInfoHelper(), tzDatetimeType);
- auto trimmer = MakeBlockTrimmer(NMiniKQL::TTypeInfoHelper(), tzDatetimeType, data.ArrowPool);
- for (size_t i = 0; i < testSize; i++) {
- TBlockItem dt = TBlockItem(i);
- dt.SetTimezoneId(i * 2);
- builder->Add(dt);
- }
- auto datum = builder->Build(true);
- Y_ENSURE(datum.is_array());
- auto array = datum.array();
- for (size_t sliceIdx = 0; sliceIdx < testSize / sliceSize; sliceIdx++) {
- auto slice = Chop(array, sliceSize);
- auto trimmedSlice = trimmer->Trim(slice);
- for (size_t elemIdx = 0; elemIdx < sliceSize; elemIdx++) {
- TBlockItem lhs = reader->GetItem(*slice, elemIdx);
- TBlockItem rhs = reader->GetItem(*trimmedSlice, elemIdx);
- UNIT_ASSERT_VALUES_EQUAL_C(lhs.Get<TDtLayout>(), rhs.Get<TDtLayout>(), "Expected the same data after trim");
- UNIT_ASSERT_VALUES_EQUAL_C(lhs.GetTimezoneId(), rhs.GetTimezoneId(), "Expected the same data after trim");
- }
- }
- }
- extern const char ResourceName[] = "Resource.Name";
- Y_UNIT_TEST(TestResource) {
- TBlockTrimmerTestData data;
- const auto resourceType = data.PgmBuilder.NewResourceType(ResourceName);
- size_t itemSize = NMiniKQL::CalcMaxBlockItemSize(resourceType);
- size_t blockLen = NMiniKQL::CalcBlockLen(itemSize);
- Y_ENSURE(blockLen > 8);
- constexpr auto testSize = NMiniKQL::MaxBlockSizeInBytes / sizeof(TUnboxedValue);
- constexpr auto sliceSize = 1024;
- static_assert(testSize % sliceSize == 0);
- auto builder = MakeArrayBuilder(NMiniKQL::TTypeInfoHelper(), resourceType, *data.ArrowPool, blockLen, nullptr);
- auto reader = MakeBlockReader(NMiniKQL::TTypeInfoHelper(), resourceType);
- auto trimmer = MakeBlockTrimmer(NMiniKQL::TTypeInfoHelper(), resourceType, data.ArrowPool);
- struct TWithDtor {
- int Payload;
- std::shared_ptr<int> DestructorCallsCnt;
- TWithDtor(int payload, std::shared_ptr<int> destructorCallsCnt):
- Payload(payload), DestructorCallsCnt(std::move(destructorCallsCnt)) {
- }
- ~TWithDtor() {
- *DestructorCallsCnt = *DestructorCallsCnt + 1;
- }
- };
- using TTestResource = TBoxedResource<TWithDtor, ResourceName>;
- auto destructorCallsCnt = std::make_shared<int>(0);
- {
- for (size_t i = 0; i < testSize; i++) {
- builder->Add(TUnboxedValuePod(new TTestResource(i, destructorCallsCnt)));
- }
- auto datum = builder->Build(true);
- Y_ENSURE(datum.is_array());
- auto array = datum.array();
- for (size_t sliceIdx = 0; sliceIdx < testSize / sliceSize; sliceIdx++) {
- auto slice = Chop(array, sliceSize);
- auto trimmedSlice = trimmer->Trim(slice);
- for (size_t elemIdx = 0; elemIdx < sliceSize; elemIdx++) {
- TBlockItem lhs = reader->GetItem(*slice, elemIdx);
- TBlockItem rhs = reader->GetItem(*trimmedSlice, elemIdx);
- auto lhsResource = reinterpret_cast<TTestResource*>(lhs.GetBoxed().Get());
- auto rhsResource = reinterpret_cast<TTestResource*>(rhs.GetBoxed().Get());
- UNIT_ASSERT_VALUES_EQUAL_C(lhsResource->Get()->Payload, rhsResource->Get()->Payload, "Expected the same data after trim");
- }
- }
- }
- UNIT_ASSERT_VALUES_EQUAL_C(*destructorCallsCnt, testSize, "Expected 1 call to resource destructor");
- }
- }
|