histogram_udf.cpp 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018
  1. #include <yql/essentials/public/udf/udf_helpers.h>
  2. #include <library/cpp/histogram/adaptive/adaptive_histogram.h>
  3. #include <library/cpp/histogram/adaptive/block_histogram.h>
  4. #include <util/string/printf.h>
  5. #include <util/stream/format.h>
  6. #include <cmath>
  7. using namespace NKikimr;
  8. using namespace NUdf;
  9. using namespace NKiwiAggr;
  10. namespace {
  // Expands to `T<name>,` -- the identifier T##name followed by a comma.
  #define REGISTER_METHOD_UDF(name) \
      T##name,

  // X-macro list: invokes APPLY(name) once for each listed method name.
  // (Per the macro name these are the methods taking one double argument.)
  #define HISTOGRAM_ONE_DOUBLE_ARG_METHODS_MAP(APPLY) \
      APPLY(GetSumAboveBound)                         \
      APPLY(GetSumBelowBound)                         \
      APPLY(CalcUpperBound)                           \
      APPLY(CalcLowerBound)                           \
      APPLY(CalcUpperBoundSafe)                       \
      APPLY(CalcLowerBoundSafe)

  // X-macro list: invokes APPLY(name) for the single listed method name.
  // (Per the macro name this is the method taking two double arguments.)
  #define HISTOGRAM_TWO_DOUBLE_ARG_METHODS_MAP(APPLY) \
      APPLY(GetSumInRange)

  // X-macro list of histogram algorithm names; invokes APPLY(name) for each.
  #define HISTOGRAM_ALGORITHMS_MAP(APPLY) \
      APPLY(AdaptiveDistance)             \
      APPLY(AdaptiveWeight)               \
      APPLY(AdaptiveWard)                 \
      APPLY(BlockWeight)                  \
      APPLY(BlockWard)

  // X-macro list of per-histogram function names; invokes APPLY(function, extra)
  // for each, passing `extra` through unchanged as the second argument.
  #define HISTOGRAM_FUNCTION_MAP(APPLY, extra) \
      APPLY(Create, extra)                     \
      APPLY(AddValue, extra)                   \
      APPLY(GetResult, extra)                  \
      APPLY(Serialize, extra)                  \
      APPLY(Deserialize, extra)                \
      APPLY(Merge, extra)

  // Defines `<kind>HistogramResourceName` holding the string "Histogram.<kind>".
  // The 10-character "Histogram." prefix is what the Name() helpers strip with substr(10).
  #define DECLARE_HISTOGRAM_RESOURCE_NAME(kind) extern const char kind##HistogramResourceName[] = "Histogram." #kind;

  // One resource name per listed algorithm, plus the two fixed-grid histograms.
  HISTOGRAM_ALGORITHMS_MAP(DECLARE_HISTOGRAM_RESOURCE_NAME)
  DECLARE_HISTOGRAM_RESOURCE_NAME(Linear)
  DECLARE_HISTOGRAM_RESOURCE_NAME(Logarithmic)
  39. class TLinearHistogram: public TAdaptiveWardHistogram {
  40. public:
  41. TLinearHistogram(double step, double begin, double end)
  42. : TAdaptiveWardHistogram(1ULL << 24)
  43. , Step(step)
  44. , Begin(begin)
  45. , End(end)
  46. {
  47. }
  48. void Add(double value, double weight) override {
  49. if (value < Begin) {
  50. value = Begin;
  51. } else if (value > End) {
  52. value = End;
  53. } else {
  54. value = std::floor(value / Step + 0.5) * Step;
  55. }
  56. TAdaptiveWardHistogram::Add(value, weight);
  57. }
  58. void Add(const THistoRec&) override {
  59. Y_ABORT("Not implemented");
  60. }
  61. protected:
  62. double Step;
  63. double Begin;
  64. double End;
  65. };
  66. class TLogarithmicHistogram: public TLinearHistogram {
  67. public:
  68. TLogarithmicHistogram(double step, double begin, double end)
  69. : TLinearHistogram(step, begin, end)
  70. {
  71. }
  72. void Add(double value, double weight) override {
  73. double base = std::log(value) / std::log(Step);
  74. double prev = std::pow(Step, std::floor(base));
  75. double next = std::pow(Step, std::ceil(base));
  76. if (std::abs(value - next) > std::abs(value - prev)) {
  77. value = prev;
  78. } else {
  79. value = next;
  80. }
  81. if (value < Begin) {
  82. value = Begin;
  83. } else if (value > End) {
  84. value = End;
  85. }
  86. if (!std::isnan(value)) {
  87. TAdaptiveWardHistogram::Add(value, weight);
  88. }
  89. }
  90. void Add(const THistoRec&) override {
  91. Y_ABORT("Not implemented");
  92. }
  93. };
  94. template <typename THistogramType, const char* ResourceName>
  95. class THistogram_Create: public TBoxedValue {
  96. public:
  97. THistogram_Create(TSourcePosition pos)
  98. : Pos_(pos)
  99. {}
  100. typedef TBoxedResource<THistogramType, ResourceName> THistogramResource;
  101. static const TStringRef& Name() {
  102. static auto name = TString(ResourceName).substr(10) + "Histogram_Create";
  103. static auto nameRef = TStringRef(name);
  104. return nameRef;
  105. }
  106. private:
  107. TUnboxedValue Run(
  108. const IValueBuilder* valueBuilder,
  109. const TUnboxedValuePod* args) const override {
  110. try {
  111. Y_UNUSED(valueBuilder);
  112. THolder<THistogramResource> histogram(new THistogramResource(args[2].Get<ui32>()));
  113. histogram->Get()->Add(args[0].Get<double>(), args[1].Get<double>());
  114. return TUnboxedValuePod(histogram.Release());
  115. } catch (const std::exception& e) {
  116. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  117. }
  118. }
  119. public:
  120. static bool DeclareSignature(
  121. const TStringRef& name,
  122. TType* userType,
  123. IFunctionTypeInfoBuilder& builder,
  124. bool typesOnly) {
  125. Y_UNUSED(userType);
  126. if (Name() == name) {
  127. builder.SimpleSignature<TResource<ResourceName>(double, double, ui32)>();
  128. if (!typesOnly) {
  129. builder.Implementation(new THistogram_Create<THistogramType, ResourceName>(builder.GetSourcePosition()));
  130. }
  131. return true;
  132. } else {
  133. return false;
  134. }
  135. }
  136. private:
  137. TSourcePosition Pos_;
  138. };
  139. template <typename THistogramType, const char* ResourceName>
  140. class THistogram_AddValue: public TBoxedValue {
  141. public:
  142. THistogram_AddValue(TSourcePosition pos)
  143. : Pos_(pos)
  144. {}
  145. typedef TBoxedResource<THistogramType, ResourceName> THistogramResource;
  146. static const TStringRef& Name() {
  147. static auto name = TString(ResourceName).substr(10) + "Histogram_AddValue";
  148. static auto nameRef = TStringRef(name);
  149. return nameRef;
  150. }
  151. private:
  152. TUnboxedValue Run(
  153. const IValueBuilder* valueBuilder,
  154. const TUnboxedValuePod* args) const override {
  155. try {
  156. Y_UNUSED(valueBuilder);
  157. THistogramResource* resource = static_cast<THistogramResource*>(args[0].AsBoxed().Get());
  158. resource->Get()->Add(args[1].Get<double>(), args[2].Get<double>());
  159. return TUnboxedValuePod(args[0]);
  160. } catch (const std::exception& e) {
  161. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  162. }
  163. }
  164. public:
  165. static bool DeclareSignature(
  166. const TStringRef& name,
  167. TType* userType,
  168. IFunctionTypeInfoBuilder& builder,
  169. bool typesOnly) {
  170. Y_UNUSED(userType);
  171. if (Name() == name) {
  172. builder.SimpleSignature<TResource<ResourceName>(TResource<ResourceName>, double, double)>();
  173. if (!typesOnly) {
  174. builder.Implementation(new THistogram_AddValue<THistogramType, ResourceName>(builder.GetSourcePosition()));
  175. }
  176. return true;
  177. } else {
  178. return false;
  179. }
  180. }
  181. private:
  182. TSourcePosition Pos_;
  183. };
  184. template <typename THistogramType, const char* ResourceName>
  185. class THistogram_Serialize: public TBoxedValue {
  186. public:
  187. THistogram_Serialize(TSourcePosition pos)
  188. : Pos_(pos)
  189. {}
  190. typedef TBoxedResource<THistogramType, ResourceName> THistogramResource;
  191. static const TStringRef& Name() {
  192. static auto name = TString(ResourceName).substr(10) + "Histogram_Serialize";
  193. static auto nameRef = TStringRef(name);
  194. return nameRef;
  195. }
  196. private:
  197. TUnboxedValue Run(
  198. const IValueBuilder* valueBuilder,
  199. const TUnboxedValuePod* args) const override {
  200. try {
  201. THistogram proto;
  202. TString result;
  203. static_cast<THistogramResource*>(args[0].AsBoxed().Get())->Get()->ToProto(proto);
  204. Y_PROTOBUF_SUPPRESS_NODISCARD proto.SerializeToString(&result);
  205. return valueBuilder->NewString(result);
  206. } catch (const std::exception& e) {
  207. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  208. }
  209. }
  210. public:
  211. static bool DeclareSignature(
  212. const TStringRef& name,
  213. TType* userType,
  214. IFunctionTypeInfoBuilder& builder,
  215. bool typesOnly) {
  216. Y_UNUSED(userType);
  217. if (Name() == name) {
  218. builder.SimpleSignature<char*(TResource<ResourceName>)>();
  219. if (!typesOnly) {
  220. builder.Implementation(new THistogram_Serialize<THistogramType, ResourceName>(builder.GetSourcePosition()));
  221. }
  222. return true;
  223. } else {
  224. return false;
  225. }
  226. }
  227. private:
  228. TSourcePosition Pos_;
  229. };
  230. template <typename THistogramType, const char* ResourceName>
  231. class THistogram_Deserialize: public TBoxedValue {
  232. public:
  233. THistogram_Deserialize(TSourcePosition pos)
  234. : Pos_(pos)
  235. {}
  236. typedef TBoxedResource<THistogramType, ResourceName> THistogramResource;
  237. static const TStringRef& Name() {
  238. static auto name = TString(ResourceName).substr(10) + "Histogram_Deserialize";
  239. static auto nameRef = TStringRef(name);
  240. return nameRef;
  241. }
  242. private:
  243. TUnboxedValue Run(
  244. const IValueBuilder* valueBuilder,
  245. const TUnboxedValuePod* args) const override {
  246. try {
  247. Y_UNUSED(valueBuilder);
  248. THistogram proto;
  249. Y_PROTOBUF_SUPPRESS_NODISCARD proto.ParseFromString(TString(args[0].AsStringRef()));
  250. THolder<THistogramResource> histogram(new THistogramResource(args[1].Get<ui32>()));
  251. histogram->Get()->FromProto(proto);
  252. return TUnboxedValuePod(histogram.Release());
  253. } catch (const std::exception& e) {
  254. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  255. }
  256. }
  257. public:
  258. static bool DeclareSignature(
  259. const TStringRef& name,
  260. TType* userType,
  261. IFunctionTypeInfoBuilder& builder,
  262. bool typesOnly) {
  263. Y_UNUSED(userType);
  264. if (Name() == name) {
  265. builder.SimpleSignature<TResource<ResourceName>(char*, ui32)>();
  266. if (!typesOnly) {
  267. builder.Implementation(new THistogram_Deserialize<THistogramType, ResourceName>(builder.GetSourcePosition()));
  268. }
  269. return true;
  270. } else {
  271. return false;
  272. }
  273. }
  274. private:
  275. TSourcePosition Pos_;
  276. };
  277. template <typename THistogramType, const char* ResourceName>
  278. class THistogram_Merge: public TBoxedValue {
  279. public:
  280. THistogram_Merge(TSourcePosition pos)
  281. : Pos_(pos)
  282. {}
  283. typedef TBoxedResource<THistogramType, ResourceName> THistogramResource;
  284. static const TStringRef& Name() {
  285. static auto name = TString(ResourceName).substr(10) + "Histogram_Merge";
  286. static auto nameRef = TStringRef(name);
  287. return nameRef;
  288. }
  289. private:
  290. TUnboxedValue Run(
  291. const IValueBuilder* valueBuilder,
  292. const TUnboxedValuePod* args) const override {
  293. try {
  294. Y_UNUSED(valueBuilder);
  295. THistogram proto;
  296. static_cast<THistogramResource*>(args[0].AsBoxed().Get())->Get()->ToProto(proto);
  297. static_cast<THistogramResource*>(args[1].AsBoxed().Get())->Get()->Merge(proto, 1.0);
  298. return TUnboxedValuePod(args[1]);
  299. } catch (const std::exception& e) {
  300. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  301. }
  302. }
  303. public:
  304. static bool DeclareSignature(
  305. const TStringRef& name,
  306. TType* userType,
  307. IFunctionTypeInfoBuilder& builder,
  308. bool typesOnly) {
  309. Y_UNUSED(userType);
  310. if (Name() == name) {
  311. builder.SimpleSignature<TResource<ResourceName>(TResource<ResourceName>, TResource<ResourceName>)>();
  312. if (!typesOnly) {
  313. builder.Implementation(new THistogram_Merge<THistogramType, ResourceName>(builder.GetSourcePosition()));
  314. }
  315. return true;
  316. } else {
  317. return false;
  318. }
  319. }
  320. private:
  321. TSourcePosition Pos_;
  322. };
  323. struct THistogramIndexes {
  324. static constexpr ui32 BinFieldsCount = 2U;
  325. static constexpr ui32 ResultFieldsCount = 5U;
  326. THistogramIndexes(IFunctionTypeInfoBuilder& builder) {
  327. const auto binStructType = builder.Struct(BinFieldsCount)->AddField<double>("Position", &Position).AddField<double>("Frequency", &Frequency).Build();
  328. const auto binsList = builder.List()->Item(binStructType).Build();
  329. ResultStructType = builder.Struct(ResultFieldsCount)->AddField<char*>("Kind", &Kind).AddField<double>("Min", &Min).AddField<double>("Max", &Max).AddField<double>("WeightsSum", &WeightsSum).AddField("Bins", binsList, &Bins).Build();
  330. }
  331. ui32 Kind;
  332. ui32 Min;
  333. ui32 Max;
  334. ui32 WeightsSum;
  335. ui32 Bins;
  336. ui32 Position;
  337. ui32 Frequency;
  338. TType* ResultStructType;
  339. };
  340. template <typename THistogramType, const char* ResourceName>
  341. class THistogram_GetResult: public TBoxedValue {
  342. public:
  343. typedef TBoxedResource<THistogramType, ResourceName> THistogramResource;
  344. THistogram_GetResult(const THistogramIndexes& histogramIndexes, TSourcePosition pos)
  345. : HistogramIndexes(histogramIndexes)
  346. , Pos_(pos)
  347. {
  348. }
  349. static const TStringRef& Name() {
  350. static auto name = TString(ResourceName).substr(10) + "Histogram_GetResult";
  351. static auto nameRef = TStringRef(name);
  352. return nameRef;
  353. }
  354. private:
  355. TUnboxedValue Run(
  356. const IValueBuilder* valueBuilder,
  357. const TUnboxedValuePod* args) const override {
  358. THistogram proto;
  359. auto histogram = static_cast<THistogramResource*>(args[0].AsBoxed().Get())->Get();
  360. histogram->ToProto(proto);
  361. auto size = proto.FreqSize();
  362. TUnboxedValue* fields = nullptr;
  363. auto result = valueBuilder->NewArray(HistogramIndexes.ResultFieldsCount, fields);
  364. fields[HistogramIndexes.Kind] = valueBuilder->NewString(TStringBuf(ResourceName).Skip(10));
  365. if (size) {
  366. TUnboxedValue* items = nullptr;
  367. fields[HistogramIndexes.Bins] = valueBuilder->NewArray(size, items);
  368. fields[HistogramIndexes.Min] = TUnboxedValuePod(static_cast<double>(histogram->GetMinValue()));
  369. fields[HistogramIndexes.Max] = TUnboxedValuePod(static_cast<double>(histogram->GetMaxValue()));
  370. fields[HistogramIndexes.WeightsSum] = TUnboxedValuePod(static_cast<double>(histogram->GetSum()));
  371. for (ui64 i = 0; i < size; ++i) {
  372. TUnboxedValue* binFields = nullptr;
  373. *items++ = valueBuilder->NewArray(HistogramIndexes.BinFieldsCount, binFields);
  374. binFields[HistogramIndexes.Frequency] = TUnboxedValuePod(static_cast<double>(proto.GetFreq(i)));
  375. binFields[HistogramIndexes.Position] = TUnboxedValuePod(static_cast<double>(proto.GetPosition(i)));
  376. }
  377. } else {
  378. fields[HistogramIndexes.Bins] = valueBuilder->NewEmptyList();
  379. fields[HistogramIndexes.Min] = TUnboxedValuePod(0.0);
  380. fields[HistogramIndexes.Max] = TUnboxedValuePod(0.0);
  381. fields[HistogramIndexes.WeightsSum] = TUnboxedValuePod(0.0);
  382. }
  383. return result;
  384. }
  385. public:
  386. static bool DeclareSignature(
  387. const TStringRef& name,
  388. TType* userType,
  389. IFunctionTypeInfoBuilder& builder,
  390. bool typesOnly) {
  391. Y_UNUSED(userType);
  392. if (Name() == name) {
  393. auto resource = builder.Resource(TStringRef(ResourceName, std::strlen(ResourceName)));
  394. THistogramIndexes histogramIndexes(builder);
  395. builder.Args()->Add(resource).Done().Returns(histogramIndexes.ResultStructType);
  396. if (!typesOnly) {
  397. builder.Implementation(new THistogram_GetResult<THistogramType, ResourceName>(histogramIndexes, builder.GetSourcePosition()));
  398. }
  399. return true;
  400. } else {
  401. return false;
  402. }
  403. }
  404. private:
  405. const THistogramIndexes HistogramIndexes;
  406. TSourcePosition Pos_;
  407. };
  408. template <>
  409. TUnboxedValue THistogram_Create<TLinearHistogram, LinearHistogramResourceName>::Run(
  410. const IValueBuilder* valueBuilder,
  411. const TUnboxedValuePod* args) const {
  412. using THistogramResource = THistogram_Create<TLinearHistogram, LinearHistogramResourceName>::THistogramResource;
  413. try {
  414. Y_UNUSED(valueBuilder);
  415. THolder<THistogramResource> histogram(new THistogramResource(
  416. args[1].Get<double>(), args[2].Get<double>(), args[3].Get<double>()));
  417. histogram->Get()->Add(args[0].Get<double>(), 1.0);
  418. return TUnboxedValuePod(histogram.Release());
  419. } catch (const std::exception& e) {
  420. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  421. }
  422. }
  423. template <>
  424. bool THistogram_Create<TLinearHistogram, LinearHistogramResourceName>::DeclareSignature(
  425. const TStringRef& name,
  426. TType* userType,
  427. IFunctionTypeInfoBuilder& builder,
  428. bool typesOnly) {
  429. Y_UNUSED(userType);
  430. if (Name() == name) {
  431. builder.SimpleSignature<TResource<LinearHistogramResourceName>(double, double, double, double)>();
  432. if (!typesOnly) {
  433. builder.Implementation(new THistogram_Create<TLinearHistogram, LinearHistogramResourceName>(builder.GetSourcePosition()));
  434. }
  435. return true;
  436. } else {
  437. return false;
  438. }
  439. }
  440. template <>
  441. TUnboxedValue THistogram_Deserialize<TLinearHistogram, LinearHistogramResourceName>::Run(
  442. const IValueBuilder* valueBuilder,
  443. const TUnboxedValuePod* args) const {
  444. using THistogramResource = THistogram_Deserialize<TLinearHistogram, LinearHistogramResourceName>::THistogramResource;
  445. try {
  446. Y_UNUSED(valueBuilder);
  447. THistogram proto;
  448. Y_PROTOBUF_SUPPRESS_NODISCARD proto.ParseFromString(TString(args[0].AsStringRef()));
  449. THolder<THistogramResource> histogram(
  450. new THistogramResource(args[1].Get<double>(), args[2].Get<double>(), args[3].Get<double>()));
  451. histogram->Get()->FromProto(proto);
  452. return TUnboxedValuePod(histogram.Release());
  453. } catch (const std::exception& e) {
  454. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  455. }
  456. }
  457. template <>
  458. bool THistogram_Deserialize<TLinearHistogram, LinearHistogramResourceName>::DeclareSignature(
  459. const TStringRef& name,
  460. TType* userType,
  461. IFunctionTypeInfoBuilder& builder,
  462. bool typesOnly) {
  463. Y_UNUSED(userType);
  464. if (Name() == name) {
  465. builder.SimpleSignature<TResource<LinearHistogramResourceName>(char*, double, double, double)>();
  466. if (!typesOnly) {
  467. builder.Implementation(new THistogram_Deserialize<TLinearHistogram, LinearHistogramResourceName>(builder.GetSourcePosition()));
  468. }
  469. return true;
  470. } else {
  471. return false;
  472. }
  473. }
  474. template <>
  475. TUnboxedValue THistogram_Create<TLogarithmicHistogram, LogarithmicHistogramResourceName>::Run(
  476. const IValueBuilder* valueBuilder,
  477. const TUnboxedValuePod* args) const {
  478. using THistogramResource = THistogram_Create<TLogarithmicHistogram, LogarithmicHistogramResourceName>::THistogramResource;
  479. try {
  480. Y_UNUSED(valueBuilder);
  481. THolder<THistogramResource> histogram(new THistogramResource(
  482. args[1].Get<double>(), args[2].Get<double>(), args[3].Get<double>()));
  483. histogram->Get()->Add(args[0].Get<double>(), 1.0);
  484. return TUnboxedValuePod(histogram.Release());
  485. } catch (const std::exception& e) {
  486. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  487. }
  488. }
  489. template <>
  490. bool THistogram_Create<TLogarithmicHistogram, LogarithmicHistogramResourceName>::DeclareSignature(
  491. const TStringRef& name,
  492. TType* userType,
  493. IFunctionTypeInfoBuilder& builder,
  494. bool typesOnly) {
  495. Y_UNUSED(userType);
  496. if (Name() == name) {
  497. builder.SimpleSignature<TResource<LogarithmicHistogramResourceName>(double, double, double, double)>();
  498. if (!typesOnly) {
  499. builder.Implementation(new THistogram_Create<TLogarithmicHistogram, LogarithmicHistogramResourceName>(builder.GetSourcePosition()));
  500. }
  501. return true;
  502. } else {
  503. return false;
  504. }
  505. }
  506. template <>
  507. TUnboxedValue THistogram_Deserialize<TLogarithmicHistogram, LogarithmicHistogramResourceName>::Run(
  508. const IValueBuilder* valueBuilder,
  509. const TUnboxedValuePod* args) const {
  510. using THistogramResource = THistogram_Deserialize<TLogarithmicHistogram, LogarithmicHistogramResourceName>::THistogramResource;
  511. try {
  512. Y_UNUSED(valueBuilder);
  513. THistogram proto;
  514. Y_PROTOBUF_SUPPRESS_NODISCARD proto.ParseFromString(TString(args[0].AsStringRef()));
  515. THolder<THistogramResource> histogram(
  516. new THistogramResource(args[1].Get<double>(), args[2].Get<double>(), args[3].Get<double>()));
  517. histogram->Get()->FromProto(proto);
  518. return TUnboxedValuePod(histogram.Release());
  519. } catch (const std::exception& e) {
  520. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  521. }
  522. }
  523. template <>
  524. bool THistogram_Deserialize<TLogarithmicHistogram, LogarithmicHistogramResourceName>::DeclareSignature(
  525. const TStringRef& name,
  526. TType* userType,
  527. IFunctionTypeInfoBuilder& builder,
  528. bool typesOnly) {
  529. Y_UNUSED(userType);
  530. if (Name() == name) {
  531. builder.SimpleSignature<TResource<LogarithmicHistogramResourceName>(char*, double, double, double)>();
  532. if (!typesOnly) {
  533. builder.Implementation(new THistogram_Deserialize<TLogarithmicHistogram, LogarithmicHistogramResourceName>(builder.GetSourcePosition()));
  534. }
  535. return true;
  536. } else {
  537. return false;
  538. }
  539. }
  540. class THistogramPrint: public TBoxedValue {
  541. public:
  542. THistogramPrint(const THistogramIndexes& histogramIndexes)
  543. : HistogramIndexes(histogramIndexes)
  544. {
  545. }
  546. static const TStringRef& Name() {
  547. static auto name = TStringRef::Of("Print");
  548. return name;
  549. }
  550. TUnboxedValue Run(
  551. const IValueBuilder* valueBuilder,
  552. const TUnboxedValuePod* args) const override {
  553. auto kind = args[0].GetElement(HistogramIndexes.Kind);
  554. auto bins = args[0].GetElement(HistogramIndexes.Bins);
  555. double min = args[0].GetElement(HistogramIndexes.Min).Get<double>();
  556. double max = args[0].GetElement(HistogramIndexes.Max).Get<double>();
  557. double weightsSum = args[0].GetElement(HistogramIndexes.WeightsSum).Get<double>();
  558. auto binsIterator = bins.GetListIterator();
  559. TStringBuilder result;
  560. result << "Kind: " << (TStringBuf)kind.AsStringRef() << ' ';
  561. result << Sprintf("Bins: %" PRIu64 " WeightsSum: %.3f Min: %.3f Max: %.3f",
  562. bins.GetListLength(), weightsSum, min, max);
  563. double maxFrequency = 0.0;
  564. size_t maxPositionLength = 0;
  565. size_t maxFrequencyLength = 0;
  566. const ui8 bars = args[1].GetOrDefault<ui8>(25);
  567. for (TUnboxedValue current; binsIterator.Next(current);) {
  568. if (bars) {
  569. double frequency = current.GetElement(HistogramIndexes.Frequency).Get<double>();
  570. if (frequency > maxFrequency) {
  571. maxFrequency = frequency;
  572. }
  573. }
  574. size_t positionLength = Sprintf("%.3f", current.GetElement(HistogramIndexes.Position).Get<double>()).length();
  575. size_t frequencyLength = Sprintf("%.3f", current.GetElement(HistogramIndexes.Frequency).Get<double>()).length();
  576. if (positionLength > maxPositionLength) {
  577. maxPositionLength = positionLength;
  578. }
  579. if (frequencyLength > maxFrequencyLength) {
  580. maxFrequencyLength = frequencyLength;
  581. }
  582. }
  583. binsIterator = bins.GetListIterator();
  584. for (TUnboxedValue current; binsIterator.Next(current);) {
  585. double position = current.GetElement(HistogramIndexes.Position).Get<double>();
  586. double frequency = current.GetElement(HistogramIndexes.Frequency).Get<double>();
  587. result << "\n";
  588. if (bars && maxFrequency > 0) {
  589. ui8 filledBars = static_cast<ui8>(bars * frequency / maxFrequency);
  590. for (ui8 i = 0; i < bars; ++i) {
  591. if (i < filledBars) {
  592. result << "█";
  593. } else {
  594. result << "░";
  595. }
  596. }
  597. }
  598. result << " P: " << LeftPad(Sprintf("%.3f", position), maxPositionLength);
  599. result << " F: " << LeftPad(Sprintf("%.3f", frequency), maxFrequencyLength);
  600. }
  601. return valueBuilder->NewString(result);
  602. }
  603. static bool DeclareSignature(
  604. const TStringRef& name,
  605. TType* userType,
  606. IFunctionTypeInfoBuilder& builder,
  607. bool typesOnly) {
  608. Y_UNUSED(userType);
  609. if (Name() == name) {
  610. THistogramIndexes histogramIndexes(builder);
  611. auto optionalUi8 = builder.Optional()->Item<ui8>().Build();
  612. builder.Args()->Add(histogramIndexes.ResultStructType).Flags(ICallablePayload::TArgumentFlags::AutoMap).Add(optionalUi8).Done().OptionalArgs(1).Returns<char*>();
  613. if (!typesOnly) {
  614. builder.Implementation(new THistogramPrint(histogramIndexes));
  615. }
  616. builder.IsStrict();
  617. return true;
  618. } else {
  619. return false;
  620. }
  621. }
  622. private:
  623. const THistogramIndexes HistogramIndexes;
  624. };
  625. class THistogramToCumulativeDistributionFunction: public TBoxedValue {
  626. public:
  627. THistogramToCumulativeDistributionFunction(const THistogramIndexes& histogramIndexes)
  628. : HistogramIndexes(histogramIndexes)
  629. {
  630. }
  631. static const TStringRef& Name() {
  632. static auto name = TStringRef::Of("ToCumulativeDistributionFunction");
  633. return name;
  634. }
  635. TUnboxedValue Run(
  636. const IValueBuilder* valueBuilder,
  637. const TUnboxedValuePod* args) const override {
  638. TUnboxedValue* fields = nullptr;
  639. auto result = valueBuilder->NewArray(HistogramIndexes.ResultFieldsCount, fields);
  640. auto bins = args[0].GetElement(HistogramIndexes.Bins);
  641. double minValue = args[0].GetElement(HistogramIndexes.Min).Get<double>();
  642. double maxValue = args[0].GetElement(HistogramIndexes.Max).Get<double>();
  643. double sum = 0.0;
  644. double weightsSum = 0.0;
  645. std::vector<TUnboxedValue> resultBins;
  646. if (bins.HasFastListLength())
  647. resultBins.reserve(bins.GetListLength());
  648. const auto binsIterator = bins.GetListIterator();
  649. for (TUnboxedValue current; binsIterator.Next(current);) {
  650. TUnboxedValue* binFields = nullptr;
  651. auto resultCurrent = valueBuilder->NewArray(HistogramIndexes.BinFieldsCount, binFields);
  652. const auto frequency = current.GetElement(HistogramIndexes.Frequency).Get<double>();
  653. sum += frequency;
  654. weightsSum += sum;
  655. binFields[HistogramIndexes.Frequency] = TUnboxedValuePod(sum);
  656. binFields[HistogramIndexes.Position] = current.GetElement(HistogramIndexes.Position);
  657. resultBins.emplace_back(std::move(resultCurrent));
  658. }
  659. auto kind = args[0].GetElement(HistogramIndexes.Kind);
  660. fields[HistogramIndexes.Kind] = valueBuilder->AppendString(kind, "Cdf");
  661. fields[HistogramIndexes.Bins] = valueBuilder->NewList(resultBins.data(), resultBins.size());
  662. fields[HistogramIndexes.Max] = TUnboxedValuePod(maxValue);
  663. fields[HistogramIndexes.Min] = TUnboxedValuePod(minValue);
  664. fields[HistogramIndexes.WeightsSum] = TUnboxedValuePod(weightsSum);
  665. return result;
  666. }
  667. static bool DeclareSignature(
  668. const TStringRef& name,
  669. TType* userType,
  670. IFunctionTypeInfoBuilder& builder,
  671. bool typesOnly) {
  672. Y_UNUSED(userType);
  673. if (Name() == name) {
  674. THistogramIndexes histogramIndexes(builder);
  675. builder.Args()->Add(histogramIndexes.ResultStructType).Flags(ICallablePayload::TArgumentFlags::AutoMap).Done().Returns(histogramIndexes.ResultStructType);
  676. if (!typesOnly) {
  677. builder.Implementation(new THistogramToCumulativeDistributionFunction(histogramIndexes));
  678. }
  679. builder.IsStrict();
  680. return true;
  681. } else {
  682. return false;
  683. }
  684. }
  685. private:
  686. const THistogramIndexes HistogramIndexes;
  687. };
  688. class THistogramNormalize: public TBoxedValue {
  689. public:
  690. THistogramNormalize(const THistogramIndexes& histogramIndexes)
  691. : HistogramIndexes(histogramIndexes)
  692. {
  693. }
  694. static const TStringRef& Name() {
  695. static auto name = TStringRef::Of("Normalize");
  696. return name;
  697. }
  698. TUnboxedValue Run(
  699. const IValueBuilder* valueBuilder,
  700. const TUnboxedValuePod* args) const override {
  701. TUnboxedValue* fields = nullptr;
  702. auto result = valueBuilder->NewArray(HistogramIndexes.ResultFieldsCount, fields);
  703. auto bins = args[0].GetElement(HistogramIndexes.Bins);
  704. double minValue = args[0].GetElement(HistogramIndexes.Min).Get<double>();
  705. double maxValue = args[0].GetElement(HistogramIndexes.Max).Get<double>();
  706. double area = args[1].GetOrDefault<double>(100.0);
  707. bool cdfNormalization = args[2].GetOrDefault<bool>(false);
  708. double sum = 0.0;
  709. double weightsSum = 0.0;
  710. double lastBinFrequency = 0.0;
  711. std::vector<TUnboxedValue> resultBins;
  712. if (bins.HasFastListLength())
  713. resultBins.reserve(bins.GetListLength());
  714. auto binsIterator = bins.GetListIterator();
  715. for (TUnboxedValue current; binsIterator.Next(current);) {
  716. sum += current.GetElement(HistogramIndexes.Frequency).Get<double>();
  717. lastBinFrequency = current.GetElement(HistogramIndexes.Frequency).Get<double>();
  718. }
  719. binsIterator = bins.GetListIterator();
  720. for (TUnboxedValue current; binsIterator.Next(current);) {
  721. TUnboxedValue* binFields = nullptr;
  722. auto resultCurrent = valueBuilder->NewArray(HistogramIndexes.BinFieldsCount, binFields);
  723. double frequency = current.GetElement(HistogramIndexes.Frequency).Get<double>();
  724. if (cdfNormalization) {
  725. frequency = area * frequency / lastBinFrequency;
  726. } else {
  727. frequency = area * frequency / sum;
  728. }
  729. weightsSum += frequency;
  730. binFields[HistogramIndexes.Frequency] = TUnboxedValuePod(frequency);
  731. binFields[HistogramIndexes.Position] = current.GetElement(HistogramIndexes.Position);
  732. resultBins.emplace_back(std::move(resultCurrent));
  733. }
  734. TUnboxedValue kind = args[0].GetElement(HistogramIndexes.Kind);
  735. if (cdfNormalization) {
  736. kind = valueBuilder->AppendString(kind, "Cdf");
  737. }
  738. fields[HistogramIndexes.Kind] = kind;
  739. fields[HistogramIndexes.Bins] = valueBuilder->NewList(resultBins.data(), resultBins.size());
  740. fields[HistogramIndexes.Max] = TUnboxedValuePod(maxValue);
  741. fields[HistogramIndexes.Min] = TUnboxedValuePod(minValue);
  742. fields[HistogramIndexes.WeightsSum] = TUnboxedValuePod(weightsSum);
  743. return result;
  744. }
  745. static bool DeclareSignature(
  746. const TStringRef& name,
  747. TType* userType,
  748. IFunctionTypeInfoBuilder& builder,
  749. bool typesOnly) {
  750. Y_UNUSED(userType);
  751. if (Name() == name) {
  752. THistogramIndexes histogramIndexes(builder);
  753. auto optionalDouble = builder.Optional()->Item<double>().Build();
  754. auto optionalCdfNormalization = builder.Optional()->Item<bool>().Build();
  755. builder.Args()->Add(histogramIndexes.ResultStructType).Flags(ICallablePayload::TArgumentFlags::AutoMap).Add(optionalDouble).Add(optionalCdfNormalization).Done().Returns(histogramIndexes.ResultStructType);
  756. builder.OptionalArgs(1);
  757. builder.OptionalArgs(2);
  758. if (!typesOnly) {
  759. builder.Implementation(new THistogramNormalize(histogramIndexes));
  760. }
  761. builder.IsStrict();
  762. return true;
  763. } else {
  764. return false;
  765. }
  766. }
  767. private:
  768. const THistogramIndexes HistogramIndexes;
  769. };
  770. template <bool twoArgs>
  771. class THistogramMethodBase: public TBoxedValue {
  772. public:
  773. THistogramMethodBase(const THistogramIndexes& histogramIndexes, TSourcePosition pos)
  774. : HistogramIndexes(histogramIndexes)
  775. , Pos_(pos)
  776. {
  777. }
  778. virtual TUnboxedValue GetResult(
  779. const THistogram& input,
  780. const TUnboxedValuePod* args) const = 0;
  781. TUnboxedValue Run(
  782. const IValueBuilder*,
  783. const TUnboxedValuePod* args) const override {
  784. try {
  785. auto bins = args[0].GetElement(HistogramIndexes.Bins);
  786. double min = args[0].GetElement(HistogramIndexes.Min).template Get<double>();
  787. double max = args[0].GetElement(HistogramIndexes.Max).template Get<double>();
  788. auto binsIterator = bins.GetListIterator();
  789. THistogram histogram;
  790. histogram.SetType(HT_ADAPTIVE_HISTOGRAM);
  791. histogram.SetMinValue(min);
  792. histogram.SetMaxValue(max);
  793. for (TUnboxedValue current; binsIterator.Next(current);) {
  794. double frequency = current.GetElement(HistogramIndexes.Frequency).template Get<double>();
  795. double position = current.GetElement(HistogramIndexes.Position).template Get<double>();
  796. histogram.AddFreq(frequency);
  797. histogram.AddPosition(position);
  798. }
  799. return GetResult(histogram, args);
  800. } catch (const std::exception& e) {
  801. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  802. }
  803. }
  804. static THistogramIndexes DeclareSignatureBase(IFunctionTypeInfoBuilder& builder) {
  805. THistogramIndexes histogramIndexes(builder);
  806. if (twoArgs) {
  807. builder.Args()->Add(histogramIndexes.ResultStructType).Flags(ICallablePayload::TArgumentFlags::AutoMap).Add<double>().Add<double>().Done().Returns<double>();
  808. } else {
  809. builder.Args()->Add(histogramIndexes.ResultStructType).Flags(ICallablePayload::TArgumentFlags::AutoMap).Add<double>().Done().Returns<double>();
  810. }
  811. return histogramIndexes;
  812. }
  813. protected:
  814. const THistogramIndexes HistogramIndexes;
  815. TSourcePosition Pos_;
  816. };
  // Generates class T<name>: a UDF named <name> with the signature
  // (histogram, double) -> double. The result is obtained by building a
  // TAdaptiveWardHistogram from the input histogram and calling its <name>
  // method with args[1].
  // Note: every `name` token in the body is replaced by the macro argument.
  #define DECLARE_ONE_DOUBLE_ARG_METHOD_UDF(name) \
      class T##name: public THistogramMethodBase<false> { \
      public: \
          T##name(const THistogramIndexes& histogramIndexes, TSourcePosition pos) \
              : THistogramMethodBase<false>(histogramIndexes, pos) { \
          } \
          TUnboxedValue GetResult( \
              const THistogram& input, \
              const TUnboxedValuePod* args) const override { \
              TAdaptiveWardHistogram wardHistogram(input, input.FreqSize()); \
              const double argument = args[1].Get<double>(); \
              const double value = wardHistogram.name(argument); \
              return TUnboxedValuePod(value); \
          } \
          static const TStringRef& Name() { \
              static auto udfName = TStringRef::Of(#name); \
              return udfName; \
          } \
          static bool DeclareSignature( \
              const TStringRef& requestedName, \
              TType* userType, \
              IFunctionTypeInfoBuilder& builder, \
              bool typesOnly) { \
              Y_UNUSED(userType); \
              if (!(Name() == requestedName)) { \
                  return false; \
              } \
              const THistogramIndexes indexes = DeclareSignatureBase(builder); \
              if (!typesOnly) { \
                  builder.Implementation(new T##name(indexes, builder.GetSourcePosition())); \
              } \
              return true; \
          } \
      };
  // Generates class T<name>: a UDF named <name> with the signature
  // (histogram, double, double) -> double. The result is obtained by building
  // a TAdaptiveWardHistogram from the input histogram and calling its <name>
  // method with args[1] and args[2].
  // Note: every `name` token in the body is replaced by the macro argument.
  #define DECLARE_TWO_DOUBLE_ARG_METHOD_UDF(name) \
      class T##name: public THistogramMethodBase<true> { \
      public: \
          T##name(const THistogramIndexes& histogramIndexes, TSourcePosition pos) \
              : THistogramMethodBase<true>(histogramIndexes, pos) { \
          } \
          TUnboxedValue GetResult( \
              const THistogram& input, \
              const TUnboxedValuePod* args) const override { \
              TAdaptiveWardHistogram wardHistogram(input, input.FreqSize()); \
              const double firstArgument = args[1].Get<double>(); \
              const double secondArgument = args[2].Get<double>(); \
              const double value = wardHistogram.name(firstArgument, secondArgument); \
              return TUnboxedValuePod(value); \
          } \
          static const TStringRef& Name() { \
              static auto udfName = TStringRef::Of(#name); \
              return udfName; \
          } \
          static bool DeclareSignature( \
              const TStringRef& requestedName, \
              TType* userType, \
              IFunctionTypeInfoBuilder& builder, \
              bool typesOnly) { \
              Y_UNUSED(userType); \
              if (!(Name() == requestedName)) { \
                  return false; \
              } \
              const THistogramIndexes indexes = DeclareSignatureBase(builder); \
              if (!typesOnly) { \
                  builder.Implementation(new T##name(indexes, builder.GetSourcePosition())); \
              } \
              return true; \
          } \
      };
  // Expands to a single entry of a template-argument list, including its
  // trailing comma: the THistogram_<functionName> instantiation for the
  // T<histogramName>Histogram type and its <histogramName>HistogramResourceName.
  887. #define DECLARE_HISTOGRAM_UDF(functionName, histogramName) \
  888. THistogram_##functionName<T##histogramName##Histogram, histogramName##HistogramResourceName>,
  // Passes DECLARE_HISTOGRAM_UDF and the histogram name to HISTOGRAM_FUNCTION_MAP
  // (defined outside this part of the file; presumably it emits one entry per
  // histogram function).
  889. #define DECLARE_HISTOGRAM_UDFS(name) \
  890. HISTOGRAM_FUNCTION_MAP(DECLARE_HISTOGRAM_UDF, name)
  // Hand the class-generating macros to the method maps, which are defined
  // outside this part of the file (presumably one T<name> class per listed method).
  891. HISTOGRAM_ONE_DOUBLE_ARG_METHODS_MAP(DECLARE_ONE_DOUBLE_ARG_METHOD_UDF)
  892. HISTOGRAM_TWO_DOUBLE_ARG_METHODS_MAP(DECLARE_TWO_DOUBLE_ARG_METHOD_UDF)
  // Declares THistogramModule from the UDF classes listed below. The macro-generated
  // entries carry their own trailing commas, which is why no separators are
  // written between them here.
  893. SIMPLE_MODULE(THistogramModule,
  894. HISTOGRAM_ALGORITHMS_MAP(DECLARE_HISTOGRAM_UDFS)
  895. HISTOGRAM_ONE_DOUBLE_ARG_METHODS_MAP(REGISTER_METHOD_UDF)
  896. HISTOGRAM_TWO_DOUBLE_ARG_METHODS_MAP(REGISTER_METHOD_UDF)
  897. DECLARE_HISTOGRAM_UDFS(Linear)
  898. DECLARE_HISTOGRAM_UDFS(Logarithmic)
  899. THistogramPrint,
  900. THistogramNormalize,
  901. THistogramToCumulativeDistributionFunction)
  902. }
  // Passes THistogramModule to REGISTER_MODULES (defined outside this file
  // chunk; presumably this exposes the module to the UDF loader).
  903. REGISTER_MODULES(THistogramModule)