123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148 |
- #pragma once
- #include "histogram.h"
- #include "common.h"
- #include <library/cpp/histogram/adaptive/protos/histo.pb.h>
- #include <util/generic/ptr.h>
- #include <util/generic/vector.h>
- #include <utility>
- namespace NKiwiAggr {
- ///////////////////
- // TBlockHistogram
- ///////////////////
- /**
- * Contrary to adaptive histogram, block histogram doesn't rebuild bins
- * after the addition of each point. Instead, it accumulates points and in case the amount
- * of points overflows specified limits, it shrinks all the points at once to produce histogram
- * Indeed, there exist two limits and two shrinkage operations:
- * 1) FastGreedyShrink is fast but coarse. It is used to shrink from upper limit to intermediate limit
- * (override FastGreedyShrink to set specific behaviour)
- * 2) SlowShrink is slow, but produces finer histogram. It shrinks from the intermediate limit to the
- * actual number of bins in a manner similar to that in adaptive histogram (set CalcQuality in constuctor)
- * While FastGreedyShrink is used most of the time, SlowShrink is mostly used for histogram finalization
- */
- class TBlockHistogram: private TNonCopyable, public IHistogram {
- protected:
- static const size_t SHRINK_MULTIPLIER = 2;
- static const size_t GREEDY_SHRINK_MULTIPLIER = 4;
- static const size_t DEFAULT_INTERVALS = 100;
- static const size_t DEFAULT_SHRINK_SIZE = DEFAULT_INTERVALS * (SHRINK_MULTIPLIER + GREEDY_SHRINK_MULTIPLIER);
- const EHistogramType Type;
- const TQualityFunction CalcQuality;
- size_t Intervals;
- size_t ShrinkSize;
- size_t PrevSize;
- ui64 Id;
- double Sum;
- double MinValue;
- double MaxValue;
- TVector<TWeightedValue> Bins;
- public:
- TBlockHistogram(EHistogramType type, TQualityFunction calcQuality,
- size_t intervals, ui64 id = 0, size_t shrinkSize = DEFAULT_SHRINK_SIZE);
- virtual ~TBlockHistogram() {
- }
- virtual void Clear();
- virtual void Add(double value, double weight);
- virtual void Add(const THistoRec& histoRec);
- virtual void Merge(const THistogram& histo, double multiplier);
- virtual void Merge(const TVector<THistogram>& histogramsToMerge);
- virtual void Merge(TVector<IHistogramPtr> histogramsToMerge); // not implemented
- virtual void Multiply(double factor);
- virtual void FromProto(const THistogram& histo);
- virtual void ToProto(THistogram& histo);
- virtual void SetId(ui64 id);
- virtual ui64 GetId();
- virtual bool Empty();
- virtual double GetMinValue();
- virtual double GetMaxValue();
- virtual double GetSum();
- // methods below are not implemented
- virtual double GetSumInRange(double leftBound, double rightBound);
- virtual double GetSumAboveBound(double bound);
- virtual double GetSumBelowBound(double bound);
- virtual double CalcUpperBound(double sum);
- virtual double CalcLowerBound(double sum);
- virtual double CalcUpperBoundSafe(double sum);
- virtual double CalcLowerBoundSafe(double sum);
- private:
- void SortBins();
- void UniquifyBins();
- void CorrectShrinkSize();
- void SortAndShrink(size_t intervals, bool final = false);
- void SlowShrink(size_t intervals);
- virtual void FastGreedyShrink(size_t intervals) = 0;
- };
- /////////////////////////
- // TBlockWeightHistogram
- /////////////////////////
- class TBlockWeightHistogram: public TBlockHistogram {
- public:
- TBlockWeightHistogram(size_t intervals, ui64 id = 0, size_t shrinkSize = DEFAULT_SHRINK_SIZE);
- virtual ~TBlockWeightHistogram() {
- }
- private:
- virtual void FastGreedyShrink(size_t intervals) final;
- };
- ///////////////////////
- // TBlockWardHistogram
- ///////////////////////
- class TBlockWardHistogram: public TBlockHistogram {
- public:
- TBlockWardHistogram(size_t intervals, ui64 id = 0, size_t shrinkSize = DEFAULT_SHRINK_SIZE);
- virtual ~TBlockWardHistogram() {
- }
- private:
- using TCumulative = std::pair<double, double>; // cumulative sum of (weights, weighted centers)
- using TCumulatives = TVector<TCumulative>;
- struct TSplitInfo {
- double profit;
- TCumulatives::const_iterator beg;
- TCumulatives::const_iterator mid;
- TCumulatives::const_iterator end;
- bool operator<(const TSplitInfo& other) const {
- return profit < other.profit;
- }
- };
- private:
- virtual void FastGreedyShrink(size_t intervals) final;
- static bool CalcSplitInfo(const TCumulatives::const_iterator beg,
- const TCumulatives::const_iterator end,
- TSplitInfo& splitInfo);
- };
- }
|