#include #include #include #include #include #include #include #include #include namespace NYql { using namespace NKikimr::NMiniKQL; namespace NUdf { extern NUdf::TUniquePtr CreateStatModule(); } Y_UNIT_TEST_SUITE(TUDFStatTest) { Y_UNIT_TEST(SimplePercentile) { auto mutableFunctionRegistry = CreateFunctionRegistry(CreateBuiltinRegistry())->Clone(); auto randomProvider = CreateDeterministicRandomProvider(1); auto timeProvider = CreateDeterministicTimeProvider(10000000); NUdf::TUniquePtr module = NUdf::CreateStatModule(); mutableFunctionRegistry->AddModule("", "Stat", std::move(module)); TScopedAlloc alloc(__LOCATION__); TTypeEnvironment env(alloc); TProgramBuilder pgmBuilder(env, *mutableFunctionRegistry); auto udfTDigest_Create = pgmBuilder.Udf("Stat.TDigest_Create"); auto udfTDigest_AddValue = pgmBuilder.Udf("Stat.TDigest_AddValue"); auto udfTDigest_GetPercentile = pgmBuilder.Udf("Stat.TDigest_GetPercentile"); TRuntimeNode pgmDigest; { auto param1 = pgmBuilder.NewDataLiteral(0.0); TVector params = {param1}; pgmDigest = pgmBuilder.Apply(udfTDigest_Create, params); } for (int n = 1; n < 10; n += 1) { auto param2 = pgmBuilder.NewDataLiteral((double)n); TVector params = {pgmDigest, param2}; pgmDigest = pgmBuilder.Apply(udfTDigest_AddValue, params); } TRuntimeNode pgmReturn; { auto param2 = pgmBuilder.NewDataLiteral(0.9); TVector params = {pgmDigest, param2}; pgmReturn = pgmBuilder.Apply(udfTDigest_GetPercentile, params); } TExploringNodeVisitor explorer; explorer.Walk(pgmReturn.GetNode(), env); TComputationPatternOpts opts(alloc.Ref(), env, GetBuiltinFactory(), mutableFunctionRegistry.Get(), NUdf::EValidateMode::None, NUdf::EValidatePolicy::Fail, "", EGraphPerProcess::Multi); auto pattern = MakeComputationPattern(explorer, pgmReturn, {}, opts); auto graph = pattern->Clone(opts.ToComputationOptions(*randomProvider, *timeProvider)); auto value = graph->GetValue(); UNIT_ASSERT_DOUBLES_EQUAL(value.Get(), 8.5, 0.001); } Y_UNIT_TEST(SimplePercentileSpecific) { auto mutableFunctionRegistry = CreateFunctionRegistry(CreateBuiltinRegistry())->Clone(); auto randomProvider = CreateDeterministicRandomProvider(1); auto timeProvider = CreateDeterministicTimeProvider(1); NUdf::TUniquePtr module = NUdf::CreateStatModule(); mutableFunctionRegistry->AddModule("", "Stat", std::move(module)); TScopedAlloc alloc(__LOCATION__); TTypeEnvironment env(alloc); TProgramBuilder pgmBuilder(env, *mutableFunctionRegistry); auto udfTDigest_Create = pgmBuilder.Udf("Stat.TDigest_Create"); auto udfTDigest_AddValue = pgmBuilder.Udf("Stat.TDigest_AddValue"); auto udfTDigest_GetPercentile = pgmBuilder.Udf("Stat.TDigest_GetPercentile"); TRuntimeNode pgmDigest; { auto param1 = pgmBuilder.NewDataLiteral(75.0); TVector params = {param1}; pgmDigest = pgmBuilder.Apply(udfTDigest_Create, params); } TVector vals = {800, 20, 150}; for (auto val : vals) { auto param2 = pgmBuilder.NewDataLiteral(val); TVector params = {pgmDigest, param2}; pgmDigest = pgmBuilder.Apply(udfTDigest_AddValue, params); } TRuntimeNode pgmReturn; { auto param2 = pgmBuilder.NewDataLiteral(0.5); TVector params = {pgmDigest, param2}; pgmReturn = pgmBuilder.Apply(udfTDigest_GetPercentile, params); } TExploringNodeVisitor explorer; explorer.Walk(pgmReturn.GetNode(), env); TComputationPatternOpts opts(alloc.Ref(), env, GetBuiltinFactory(), mutableFunctionRegistry.Get(), NUdf::EValidateMode::None, NUdf::EValidatePolicy::Fail, "", EGraphPerProcess::Multi); auto pattern = MakeComputationPattern(explorer, pgmReturn, {}, opts); auto graph = pattern->Clone(opts.ToComputationOptions(*randomProvider, *timeProvider)); auto value = graph->GetValue(); Cerr << value.Get() << Endl; //~ UNIT_ASSERT_DOUBLES_EQUAL(value.Get(), 9.0, 0.001); } Y_UNIT_TEST(SerializedPercentile) { auto mutableFunctionRegistry = CreateFunctionRegistry(CreateBuiltinRegistry())->Clone(); auto randomProvider = CreateDeterministicRandomProvider(1); auto timeProvider = CreateDeterministicTimeProvider(1); NUdf::TUniquePtr module = NUdf::CreateStatModule(); mutableFunctionRegistry->AddModule("", "Stat", std::move(module)); TScopedAlloc alloc(__LOCATION__); TTypeEnvironment env(alloc); TProgramBuilder pgmBuilder(env, *mutableFunctionRegistry); auto udfTDigest_Create = pgmBuilder.Udf("Stat.TDigest_Create"); auto udfTDigest_AddValue = pgmBuilder.Udf("Stat.TDigest_AddValue"); auto udfTDigest_GetPercentile = pgmBuilder.Udf("Stat.TDigest_GetPercentile"); auto udfTDigest_Serialize = pgmBuilder.Udf("Stat.TDigest_Serialize"); auto udfTDigest_Deserialize = pgmBuilder.Udf("Stat.TDigest_Deserialize"); TRuntimeNode pgmDigest; { auto param1 = pgmBuilder.NewDataLiteral(0.0); TVector params = {param1}; pgmDigest = pgmBuilder.Apply(udfTDigest_Create, params); } for (int n = 1; n < 10; n += 1) { auto param2 = pgmBuilder.NewDataLiteral((double)n); TVector params = {pgmDigest, param2}; pgmDigest = pgmBuilder.Apply(udfTDigest_AddValue, params); } TRuntimeNode pgmSerializedData; { TVector params = {pgmDigest}; pgmSerializedData = pgmBuilder.Apply(udfTDigest_Serialize, params); } TRuntimeNode pgmDigest2; { TVector params = {pgmSerializedData}; pgmDigest2 = pgmBuilder.Apply(udfTDigest_Deserialize, params); } TRuntimeNode pgmReturn; { auto param2 = pgmBuilder.NewDataLiteral(0.9); TVector params = {pgmDigest2, param2}; pgmReturn = pgmBuilder.Apply(udfTDigest_GetPercentile, params); } TExploringNodeVisitor explorer; explorer.Walk(pgmReturn.GetNode(), env); TComputationPatternOpts opts(alloc.Ref(), env, GetBuiltinFactory(), mutableFunctionRegistry.Get(), NUdf::EValidateMode::None, NUdf::EValidatePolicy::Fail, "", EGraphPerProcess::Multi); auto pattern = MakeComputationPattern(explorer, pgmReturn, {}, opts); auto graph = pattern->Clone(opts.ToComputationOptions(*randomProvider, *timeProvider)); auto value = graph->GetValue(); UNIT_ASSERT_DOUBLES_EQUAL(value.Get(), 8.5, 0.001); } Y_UNIT_TEST(SerializedMergedPercentile) { auto mutableFunctionRegistry = CreateFunctionRegistry(CreateBuiltinRegistry())->Clone(); auto randomProvider = CreateDeterministicRandomProvider(1); auto timeProvider = CreateDeterministicTimeProvider(1); NUdf::TUniquePtr module = NUdf::CreateStatModule(); mutableFunctionRegistry->AddModule("", "Stat", std::move(module)); TScopedAlloc alloc(__LOCATION__); TTypeEnvironment env(alloc); TProgramBuilder pgmBuilder(env, *mutableFunctionRegistry); auto udfTDigest_Create = pgmBuilder.Udf("Stat.TDigest_Create"); auto udfTDigest_AddValue = pgmBuilder.Udf("Stat.TDigest_AddValue"); auto udfTDigest_GetPercentile = pgmBuilder.Udf("Stat.TDigest_GetPercentile"); auto udfTDigest_Serialize = pgmBuilder.Udf("Stat.TDigest_Serialize"); auto udfTDigest_Deserialize = pgmBuilder.Udf("Stat.TDigest_Deserialize"); auto udfTDigest_Merge = pgmBuilder.Udf("Stat.TDigest_Merge"); TVector pgmSerializedDataVector; for (int i = 0; i < 100; i += 10) { TRuntimeNode pgmDigest; { auto param1 = pgmBuilder.NewDataLiteral(double(i) / 10); TVector params = {param1}; pgmDigest = pgmBuilder.Apply(udfTDigest_Create, params); } for (int n = i + 1; n < i + 10; n += 1) { auto param2 = pgmBuilder.NewDataLiteral(double(n) / 10); TVector params = {pgmDigest, param2}; pgmDigest = pgmBuilder.Apply(udfTDigest_AddValue, params); } TRuntimeNode pgmSerializedData; { TVector params = {pgmDigest}; pgmSerializedData = pgmBuilder.Apply(udfTDigest_Serialize, params); } pgmSerializedDataVector.push_back(pgmSerializedData); } TRuntimeNode pgmDigest; for (size_t i = 0; i < pgmSerializedDataVector.size(); ++i) { TRuntimeNode pgmDigest2; { TVector params = {pgmSerializedDataVector[i]}; pgmDigest2 = pgmBuilder.Apply(udfTDigest_Deserialize, params); } if (!pgmDigest) { pgmDigest = pgmDigest2; } else { TVector params = {pgmDigest, pgmDigest2}; pgmDigest = pgmBuilder.Apply(udfTDigest_Merge, params); } } TRuntimeNode pgmReturn; { auto param2 = pgmBuilder.NewDataLiteral(0.9); TVector params = {pgmDigest, param2}; pgmReturn = pgmBuilder.Apply(udfTDigest_GetPercentile, params); } TExploringNodeVisitor explorer; explorer.Walk(pgmReturn.GetNode(), env); TComputationPatternOpts opts(alloc.Ref(), env, GetBuiltinFactory(), mutableFunctionRegistry.Get(), NUdf::EValidateMode::None, NUdf::EValidatePolicy::Fail, "", EGraphPerProcess::Multi); auto pattern = MakeComputationPattern(explorer, pgmReturn, {}, opts); auto graph = pattern->Clone(opts.ToComputationOptions(*randomProvider, *timeProvider)); auto value = graph->GetValue(); UNIT_ASSERT_DOUBLES_EQUAL(value.Get(), 8.95, 0.001); } static double GetParetoRandomNumber(double a) { return 1 / pow(RandomNumber(), double(1) / a); } Y_UNIT_TEST(BigPercentile) { auto mutableFunctionRegistry = CreateFunctionRegistry(CreateBuiltinRegistry())->Clone(); auto randomProvider = CreateDeterministicRandomProvider(1); auto timeProvider = CreateDeterministicTimeProvider(1); NUdf::TUniquePtr module = NUdf::CreateStatModule(); mutableFunctionRegistry->AddModule("", "Stat", std::move(module)); TScopedAlloc alloc(__LOCATION__); TTypeEnvironment env(alloc); TProgramBuilder pgmBuilder(env, *mutableFunctionRegistry); auto udfTDigest_Create = pgmBuilder.Udf("Stat.TDigest_Create"); auto udfTDigest_AddValue = pgmBuilder.Udf("Stat.TDigest_AddValue"); auto udfTDigest_GetPercentile = pgmBuilder.Udf("Stat.TDigest_GetPercentile"); const size_t NUMBERS = 100000; const double PERCENTILE = 0.99; const double THRESHOLD = 0.0004; // at q=0.99 threshold is 4*delta*0.0099 TVector randomNumbers1; TVector randomNumbers2; randomNumbers1.reserve(NUMBERS); randomNumbers2.reserve(NUMBERS); for (size_t n = 0; n < NUMBERS; ++n) { double randomNumber = GetParetoRandomNumber(10); randomNumbers1.push_back(randomNumber); randomNumbers2.push_back(pgmBuilder.NewDataLiteral(randomNumber)); } TRuntimeNode bigList = pgmBuilder.AsList(randomNumbers2); auto pgmDigest = pgmBuilder.Fold1(bigList, [&](TRuntimeNode item) { std::array args; args[0] = item; return pgmBuilder.Apply(udfTDigest_Create, args); }, [&](TRuntimeNode item, TRuntimeNode state) { std::array args; args[0] = state; args[1] = item; return pgmBuilder.Apply(udfTDigest_AddValue, args); }); TRuntimeNode pgmReturn = pgmBuilder.Map(pgmDigest, [&](TRuntimeNode item) { auto param2 = pgmBuilder.NewDataLiteral(PERCENTILE); std::array args; args[0] = item; args[1] = param2; return pgmBuilder.Apply(udfTDigest_GetPercentile, args); }); TExploringNodeVisitor explorer; explorer.Walk(pgmReturn.GetNode(), env); TComputationPatternOpts opts(alloc.Ref(), env, GetBuiltinFactory(), mutableFunctionRegistry.Get(), NUdf::EValidateMode::None, NUdf::EValidatePolicy::Fail, "", EGraphPerProcess::Multi); auto pattern = MakeComputationPattern(explorer, pgmReturn, {}, opts); auto graph = pattern->Clone(opts.ToComputationOptions(*randomProvider, *timeProvider)); auto value = graph->GetValue(); UNIT_ASSERT(value); double digestValue = value.Get(); std::sort(randomNumbers1.begin(), randomNumbers1.end()); // This gives us a 1-based index of the last value <= digestValue auto index = std::upper_bound(randomNumbers1.begin(), randomNumbers1.end(), digestValue) - randomNumbers1.begin(); // See https://en.wikipedia.org/wiki/Percentile#First_Variant.2C double p = (index - 0.5) / double(randomNumbers1.size()); UNIT_ASSERT_DOUBLES_EQUAL(p, PERCENTILE, THRESHOLD); } Y_UNIT_TEST(CentroidPrecision) { auto mutableFunctionRegistry = CreateFunctionRegistry(CreateBuiltinRegistry())->Clone(); auto randomProvider = CreateDeterministicRandomProvider(1); auto timeProvider = CreateDeterministicTimeProvider(1); NUdf::TUniquePtr module = NUdf::CreateStatModule(); mutableFunctionRegistry->AddModule("", "Stat", std::move(module)); TScopedAlloc alloc(__LOCATION__); TTypeEnvironment env(alloc); TProgramBuilder pgmBuilder(env, *mutableFunctionRegistry); auto udfTDigest_Create = pgmBuilder.Udf("Stat.TDigest_Create"); auto udfTDigest_AddValue = pgmBuilder.Udf("Stat.TDigest_AddValue"); auto udfTDigest_GetPercentile = pgmBuilder.Udf("Stat.TDigest_GetPercentile"); const size_t NUMBERS = 100000; const double PERCENTILE = 0.25; const double minValue = 1.0; const double maxValue = 100.0; const double majorityValue = 50.0; TVector numbers; numbers.reserve(NUMBERS); for (size_t n = 0; n < NUMBERS - 2; ++n) { numbers.push_back(pgmBuilder.NewDataLiteral(majorityValue)); } numbers.push_back(pgmBuilder.NewDataLiteral(minValue)); numbers.push_back(pgmBuilder.NewDataLiteral(maxValue)); TRuntimeNode bigList = pgmBuilder.AsList(numbers); auto pgmDigest = pgmBuilder.Fold1(bigList, [&](TRuntimeNode item) { std::array args; args[0] = item; return pgmBuilder.Apply(udfTDigest_Create, args); }, [&](TRuntimeNode item, TRuntimeNode state) { std::array args; args[0] = state; args[1] = item; return pgmBuilder.Apply(udfTDigest_AddValue, args); }); TRuntimeNode pgmReturn = pgmBuilder.Map(pgmDigest, [&](TRuntimeNode item) { auto param2 = pgmBuilder.NewDataLiteral(PERCENTILE); std::array args; args[0] = item; args[1] = param2; return pgmBuilder.Apply(udfTDigest_GetPercentile, args); }); TExploringNodeVisitor explorer; explorer.Walk(pgmReturn.GetNode(), env); TComputationPatternOpts opts(alloc.Ref(), env, GetBuiltinFactory(), mutableFunctionRegistry.Get(), NUdf::EValidateMode::None, NUdf::EValidatePolicy::Fail, "", EGraphPerProcess::Multi); auto pattern = MakeComputationPattern(explorer, pgmReturn, {}, opts); auto graph = pattern->Clone(opts.ToComputationOptions(*randomProvider, *timeProvider)); auto value = graph->GetValue(); UNIT_ASSERT(value); double digestValue = value.Get(); UNIT_ASSERT_EQUAL(digestValue, majorityValue); } } }