block_histogram.h 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148
  1. #pragma once
  2. #include "histogram.h"
  3. #include "common.h"
  4. #include <library/cpp/histogram/adaptive/protos/histo.pb.h>
  5. #include <util/generic/ptr.h>
  6. #include <util/generic/vector.h>
  7. #include <utility>
  8. namespace NKiwiAggr {
  9. ///////////////////
  10. // TBlockHistogram
  11. ///////////////////
  12. /**
  13. * Contrary to adaptive histogram, block histogram doesn't rebuild bins
  14. * after the addition of each point. Instead, it accumulates points and in case the amount
  15. * of points overflows specified limits, it shrinks all the points at once to produce histogram
  16. * Indeed, there exist two limits and two shrinkage operations:
  17. * 1) FastGreedyShrink is fast but coarse. It is used to shrink from upper limit to intermediate limit
  18. * (override FastGreedyShrink to set specific behaviour)
  19. * 2) SlowShrink is slow, but produces finer histogram. It shrinks from the intermediate limit to the
  20. * actual number of bins in a manner similar to that in adaptive histogram (set CalcQuality in constuctor)
  21. * While FastGreedyShrink is used most of the time, SlowShrink is mostly used for histogram finalization
  22. */
  23. class TBlockHistogram: private TNonCopyable, public IHistogram {
  24. protected:
  25. static const size_t SHRINK_MULTIPLIER = 2;
  26. static const size_t GREEDY_SHRINK_MULTIPLIER = 4;
  27. static const size_t DEFAULT_INTERVALS = 100;
  28. static const size_t DEFAULT_SHRINK_SIZE = DEFAULT_INTERVALS * (SHRINK_MULTIPLIER + GREEDY_SHRINK_MULTIPLIER);
  29. const EHistogramType Type;
  30. const TQualityFunction CalcQuality;
  31. size_t Intervals;
  32. size_t ShrinkSize;
  33. size_t PrevSize;
  34. ui64 Id;
  35. double Sum;
  36. double MinValue;
  37. double MaxValue;
  38. TVector<TWeightedValue> Bins;
  39. public:
  40. TBlockHistogram(EHistogramType type, TQualityFunction calcQuality,
  41. size_t intervals, ui64 id = 0, size_t shrinkSize = DEFAULT_SHRINK_SIZE);
  42. virtual ~TBlockHistogram() {
  43. }
  44. virtual void Clear();
  45. virtual void Add(double value, double weight);
  46. virtual void Add(const THistoRec& histoRec);
  47. virtual void Merge(const THistogram& histo, double multiplier);
  48. virtual void Merge(const TVector<THistogram>& histogramsToMerge);
  49. virtual void Merge(TVector<IHistogramPtr> histogramsToMerge); // not implemented
  50. virtual void Multiply(double factor);
  51. virtual void FromProto(const THistogram& histo);
  52. virtual void ToProto(THistogram& histo);
  53. virtual void SetId(ui64 id);
  54. virtual ui64 GetId();
  55. virtual bool Empty();
  56. virtual double GetMinValue();
  57. virtual double GetMaxValue();
  58. virtual double GetSum();
  59. // methods below are not implemented
  60. virtual double GetSumInRange(double leftBound, double rightBound);
  61. virtual double GetSumAboveBound(double bound);
  62. virtual double GetSumBelowBound(double bound);
  63. virtual double CalcUpperBound(double sum);
  64. virtual double CalcLowerBound(double sum);
  65. virtual double CalcUpperBoundSafe(double sum);
  66. virtual double CalcLowerBoundSafe(double sum);
  67. private:
  68. void SortBins();
  69. void UniquifyBins();
  70. void CorrectShrinkSize();
  71. void SortAndShrink(size_t intervals, bool final = false);
  72. void SlowShrink(size_t intervals);
  73. virtual void FastGreedyShrink(size_t intervals) = 0;
  74. };
  75. /////////////////////////
  76. // TBlockWeightHistogram
  77. /////////////////////////
  78. class TBlockWeightHistogram: public TBlockHistogram {
  79. public:
  80. TBlockWeightHistogram(size_t intervals, ui64 id = 0, size_t shrinkSize = DEFAULT_SHRINK_SIZE);
  81. virtual ~TBlockWeightHistogram() {
  82. }
  83. private:
  84. virtual void FastGreedyShrink(size_t intervals) final;
  85. };
  86. ///////////////////////
  87. // TBlockWardHistogram
  88. ///////////////////////
  89. class TBlockWardHistogram: public TBlockHistogram {
  90. public:
  91. TBlockWardHistogram(size_t intervals, ui64 id = 0, size_t shrinkSize = DEFAULT_SHRINK_SIZE);
  92. virtual ~TBlockWardHistogram() {
  93. }
  94. private:
  95. using TCumulative = std::pair<double, double>; // cumulative sum of (weights, weighted centers)
  96. using TCumulatives = TVector<TCumulative>;
  97. struct TSplitInfo {
  98. double profit;
  99. TCumulatives::const_iterator beg;
  100. TCumulatives::const_iterator mid;
  101. TCumulatives::const_iterator end;
  102. bool operator<(const TSplitInfo& other) const {
  103. return profit < other.profit;
  104. }
  105. };
  106. private:
  107. virtual void FastGreedyShrink(size_t intervals) final;
  108. static bool CalcSplitInfo(const TCumulatives::const_iterator beg,
  109. const TCumulatives::const_iterator end,
  110. TSplitInfo& splitInfo);
  111. };
  112. }