fixed_bin_histogram.cpp 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538
  1. #include "fixed_bin_histogram.h"
  2. #include "auto_histogram.h"
  3. #include <library/cpp/histogram/adaptive/protos/histo.pb.h>
  4. #include <util/generic/algorithm.h>
  5. #include <util/generic/yexception.h>
  6. #include <util/generic/ymath.h>
  7. #include <util/string/printf.h>
  8. namespace NKiwiAggr {
  9. TFixedBinHistogram::TFixedBinHistogram(size_t intervals, ui64 id, size_t trainingSetSize)
  10. : TrainingSetSize(trainingSetSize)
  11. , IsInitialized(false)
  12. , IsEmpty(true)
  13. , Id(id)
  14. , Sum(0.0)
  15. , Freqs(0)
  16. , ReserveFreqs(0)
  17. , BinRange(0.0)
  18. , Intervals(intervals)
  19. , BaseIndex(intervals / 2)
  20. {
  21. }
  22. TFixedBinHistogram::TFixedBinHistogram(const THistogram& histo, size_t defaultIntervals, ui64 defaultId, size_t trainingSetSize)
  23. : TrainingSetSize(trainingSetSize)
  24. , IsInitialized(false)
  25. , IsEmpty(true)
  26. , Id(defaultId)
  27. , Sum(0.0)
  28. , Freqs(0)
  29. , ReserveFreqs(0)
  30. , BinRange(0.0)
  31. , Intervals(defaultIntervals)
  32. , BaseIndex(defaultIntervals / 2)
  33. {
  34. FromProto(histo);
  35. }
  36. TFixedBinHistogram::TFixedBinHistogram(IHistogram* histo, size_t defaultIntervals, ui64 defaultId, size_t trainingSetSize)
  37. : TrainingSetSize(trainingSetSize)
  38. , IsInitialized(false)
  39. , IsEmpty(true)
  40. , Id(defaultId)
  41. , Sum(0.0)
  42. , Freqs(0)
  43. , ReserveFreqs(0)
  44. , BinRange(0.0)
  45. , Intervals(defaultIntervals)
  46. , BaseIndex(defaultIntervals / 2)
  47. {
  48. TFixedBinHistogram* fixedBinHisto = dynamic_cast<TFixedBinHistogram*>(histo);
  49. if (!fixedBinHisto) {
  50. FromIHistogram(histo);
  51. return;
  52. }
  53. fixedBinHisto->Initialize();
  54. TrainingSetSize = fixedBinHisto->TrainingSetSize;
  55. IsInitialized = fixedBinHisto->IsInitialized;
  56. IsEmpty = fixedBinHisto->IsEmpty;
  57. Id = fixedBinHisto->Id;
  58. MinValue = fixedBinHisto->MinValue;
  59. MaxValue = fixedBinHisto->MaxValue;
  60. Sum = fixedBinHisto->Sum;
  61. Freqs.assign(fixedBinHisto->Freqs.begin(), fixedBinHisto->Freqs.end());
  62. ReserveFreqs.assign(fixedBinHisto->ReserveFreqs.begin(), fixedBinHisto->ReserveFreqs.end());
  63. ReferencePoint = fixedBinHisto->ReferencePoint;
  64. BinRange = fixedBinHisto->BinRange;
  65. Intervals = fixedBinHisto->Intervals;
  66. FirstUsedBin = fixedBinHisto->FirstUsedBin;
  67. LastUsedBin = fixedBinHisto->LastUsedBin;
  68. BaseIndex = fixedBinHisto->BaseIndex;
  69. }
  70. void TFixedBinHistogram::Clear() {
  71. TrainingSet.Destroy();
  72. IsInitialized = false;
  73. IsEmpty = true;
  74. Sum = 0.0;
  75. Freqs.clear();
  76. ReserveFreqs.clear();
  77. BinRange = 0.0;
  78. }
  79. void TFixedBinHistogram::Add(const THistoRec& rec) {
  80. if (!rec.HasId() || rec.GetId() == Id) {
  81. Add(rec.GetValue(), rec.GetWeight());
  82. }
  83. }
  84. void TFixedBinHistogram::Add(double value, double weight) {
  85. if (!IsValidFloat(value) || !IsValidFloat(weight)) {
  86. ythrow yexception() << Sprintf("Histogram id %lu: bad value %f weight %f", Id, value, weight);
  87. }
  88. if (weight <= 0.0) {
  89. return; // all zero-weighted values should be skipped because they don't affect the distribution, negative weights are forbidden
  90. }
  91. Sum += weight;
  92. if (!IsInitialized) {
  93. if (!TrainingSet) {
  94. TrainingSet.Reset(new TVector<TWeightedValue>(0));
  95. }
  96. TrainingSet->push_back(TWeightedValue(value, weight));
  97. if (TrainingSet->size() >= TrainingSetSize) {
  98. Initialize();
  99. }
  100. return;
  101. }
  102. i32 bin = CalcBin(value);
  103. if (bin < 0 || bin >= (i32)Freqs.size() || (BinRange == 0.0 && value != ReferencePoint)) {
  104. Shrink(Min(value, MinValue), Max(value, MaxValue));
  105. Freqs[CalcBin(value)] += weight;
  106. } else {
  107. MinValue = Min(value, MinValue);
  108. MaxValue = Max(value, MaxValue);
  109. FirstUsedBin = Min(FirstUsedBin, bin);
  110. LastUsedBin = Max(LastUsedBin, bin);
  111. Freqs[bin] += weight;
  112. }
  113. }
  114. void TFixedBinHistogram::Merge(const THistogram& /*histo*/, double /*multiplier*/) {
  115. ythrow yexception() << "Method is not implemented for TFixedBinHistogram";
  116. }
  117. void TFixedBinHistogram::Merge(const TVector<THistogram>& histogramsToMerge) {
  118. TVector<IHistogramPtr> parsedHistogramsToMerge;
  119. for (size_t i = 0; i < histogramsToMerge.size(); ++i) {
  120. parsedHistogramsToMerge.push_back(IHistogramPtr(new TAutoHistogram(histogramsToMerge[i], Intervals, Id)));
  121. }
  122. Merge(parsedHistogramsToMerge);
  123. }
  124. void TFixedBinHistogram::Merge(TVector<IHistogramPtr> histogramsToMerge) {
  125. TVector<IHistogramPtr> histogramsToMergeRepacked(0);
  126. TVector<TFixedBinHistogram*> histograms(0);
  127. // put current histogram to the vector of histograms to merge and clear self
  128. if (!Empty()) {
  129. histogramsToMergeRepacked.push_back(IHistogramPtr(new TFixedBinHistogram(this, Intervals, Id, TrainingSetSize)));
  130. histograms.push_back(dynamic_cast<TFixedBinHistogram*>(histogramsToMergeRepacked.back().Get()));
  131. }
  132. Clear();
  133. for (size_t i = 0; i < histogramsToMerge.size(); ++i) {
  134. if (!histogramsToMerge[i] || histogramsToMerge[i]->Empty()) {
  135. continue;
  136. }
  137. TFixedBinHistogram* fixedBinHisto = dynamic_cast<TFixedBinHistogram*>(histogramsToMerge[i].Get());
  138. if (fixedBinHisto) {
  139. fixedBinHisto->Initialize();
  140. histogramsToMergeRepacked.push_back(histogramsToMerge[i]);
  141. } else {
  142. histogramsToMergeRepacked.push_back(IHistogramPtr(new TFixedBinHistogram(histogramsToMerge[i].Get(), Intervals, Id, TrainingSetSize))); // Convert histograms that are not of TFixedBinHistogram type
  143. }
  144. histograms.push_back(dynamic_cast<TFixedBinHistogram*>(histogramsToMergeRepacked.back().Get()));
  145. }
  146. if (histograms.size() == 0) {
  147. return;
  148. }
  149. double minValue = histograms[0]->MinValue;
  150. double maxValue = histograms[0]->MaxValue;
  151. Sum = histograms[0]->Sum;
  152. for (size_t i = 1; i < histograms.size(); ++i) {
  153. minValue = Min(minValue, histograms[i]->MinValue);
  154. maxValue = Max(maxValue, histograms[i]->MaxValue);
  155. Sum += histograms[i]->Sum;
  156. }
  157. SetFrame(minValue, maxValue, true);
  158. if (BinRange == 0.0) {
  159. Freqs[BaseIndex] = Sum;
  160. return;
  161. }
  162. for (size_t histoIndex = 0; histoIndex < histograms.size(); ++histoIndex) {
  163. TFixedBinHistogram* histo = histograms[histoIndex];
  164. for (i32 bin = histo->FirstUsedBin; bin <= histo->LastUsedBin; ++bin) {
  165. double binStart = histo->BinStart(bin);
  166. double binEnd = histo->BinEnd(bin);
  167. double freq = histo->Freqs[bin];
  168. if (binStart == binEnd) {
  169. Freqs[CalcBin(binStart)] += freq;
  170. continue;
  171. }
  172. size_t firstCross = CalcBin(binStart);
  173. size_t lastCross = CalcBin(binEnd);
  174. for (size_t i = firstCross; i <= lastCross; ++i) {
  175. double mergedBinStart = BinStart(i);
  176. double mergedBinEnd = BinEnd(i);
  177. double crossStart = Max(mergedBinStart, binStart);
  178. double crossEnd = Min(mergedBinEnd, binEnd);
  179. if (binStart == binEnd) {
  180. }
  181. Freqs[i] += freq * (crossEnd - crossStart) / (binEnd - binStart);
  182. }
  183. }
  184. }
  185. }
  186. void TFixedBinHistogram::Multiply(double factor) {
  187. if (!IsValidFloat(factor) || factor <= 0) {
  188. ythrow yexception() << "Not valid factor in IHistogram::Multiply(): " << factor;
  189. }
  190. if (!IsInitialized) {
  191. Initialize();
  192. }
  193. Sum *= factor;
  194. for (i32 i = FirstUsedBin; i <= LastUsedBin; ++i) {
  195. Freqs[i] *= factor;
  196. }
  197. }
  198. void TFixedBinHistogram::FromProto(const THistogram& histo) {
  199. if (histo.HasType() && histo.GetType() != HT_FIXED_BIN_HISTOGRAM) {
  200. ythrow yexception() << "Attempt to parse TFixedBinHistogram from THistogram protobuf record of wrong type = " << (ui32)histo.GetType();
  201. }
  202. TrainingSet.Destroy();
  203. IsInitialized = false;
  204. Sum = 0.0;
  205. Id = histo.GetId();
  206. size_t intervals = histo.FreqSize();
  207. if (intervals == 0) {
  208. IsEmpty = true;
  209. return;
  210. }
  211. Intervals = intervals;
  212. TrainingSetSize = Intervals;
  213. BaseIndex = Intervals / 2;
  214. if (!IsValidFloat(histo.GetMinValue()) || !IsValidFloat(histo.GetMaxValue()) || !IsValidFloat(histo.GetBinRange())) {
  215. ythrow yexception() << Sprintf("FromProto in histogram id %lu: skip bad histo with minvalue %f maxvalue %f binrange %f", Id, histo.GetMinValue(), histo.GetMaxValue(), histo.GetBinRange());
  216. }
  217. double minValue = histo.GetMinValue();
  218. double binRange = histo.GetBinRange();
  219. double maxValue = histo.HasMaxValue() ? histo.GetMaxValue() : minValue + binRange * Intervals;
  220. SetFrame(minValue, maxValue, true);
  221. BinRange = binRange;
  222. for (i32 i = FirstUsedBin; i <= LastUsedBin; ++i) {
  223. Freqs[i] = histo.GetFreq(i - BaseIndex);
  224. if (!IsValidFloat(Freqs[i])) {
  225. ythrow yexception() << Sprintf("FromProto in histogram id %lu: bad value %f", Id, Freqs[i]);
  226. }
  227. Sum += Freqs[i];
  228. }
  229. }
  230. void TFixedBinHistogram::ToProto(THistogram& histo) {
  231. histo.Clear();
  232. if (!IsInitialized) {
  233. Initialize();
  234. }
  235. histo.SetType(HT_FIXED_BIN_HISTOGRAM);
  236. histo.SetId(Id);
  237. if (IsEmpty) {
  238. return;
  239. }
  240. if (FirstUsedBin < (i32)BaseIndex || (LastUsedBin - FirstUsedBin + 1) > (i32)Intervals) {
  241. Shrink(MinValue, MaxValue);
  242. }
  243. histo.SetMinValue(MinValue);
  244. histo.SetMaxValue(MaxValue);
  245. histo.SetBinRange(BinRange);
  246. for (ui32 i = BaseIndex; i < BaseIndex + Intervals; ++i) {
  247. histo.AddFreq(Freqs[i]);
  248. }
  249. }
  250. void TFixedBinHistogram::SetId(ui64 id) {
  251. Id = id;
  252. }
  253. ui64 TFixedBinHistogram::GetId() {
  254. return Id;
  255. }
  256. bool TFixedBinHistogram::Empty() {
  257. if (!IsInitialized) {
  258. Initialize();
  259. }
  260. return IsEmpty;
  261. }
  262. double TFixedBinHistogram::GetMinValue() {
  263. if (!IsInitialized) {
  264. Initialize();
  265. }
  266. return MinValue;
  267. }
  268. double TFixedBinHistogram::GetMaxValue() {
  269. if (!IsInitialized) {
  270. Initialize();
  271. }
  272. return MaxValue;
  273. }
  274. double TFixedBinHistogram::GetSum() {
  275. return Sum;
  276. }
  277. double TFixedBinHistogram::GetSumInRange(double leftBound, double rightBound) {
  278. if (!IsInitialized) {
  279. Initialize();
  280. }
  281. if (leftBound > rightBound) {
  282. return 0.0;
  283. }
  284. return GetSumAboveBound(leftBound) + GetSumBelowBound(rightBound) - Sum;
  285. }
  286. double TFixedBinHistogram::GetSumAboveBound(double bound) {
  287. if (!IsInitialized) {
  288. Initialize();
  289. }
  290. if (IsEmpty) {
  291. return 0.0;
  292. }
  293. if (BinRange == 0.0) { // special case - all values added to histogram are the same
  294. return (bound <= ReferencePoint) ? Sum : 0.0;
  295. }
  296. i32 bin = CalcBin(bound);
  297. if (bin < FirstUsedBin) {
  298. return Sum;
  299. }
  300. if (bin > LastUsedBin) {
  301. return 0.0;
  302. }
  303. double binStart = BinStart(bin);
  304. double binEnd = BinEnd(bin);
  305. double result = (bound < binStart) ? Freqs[bin] : Freqs[bin] * (binEnd - bound) / (binEnd - binStart);
  306. for (i32 i = bin + 1; i <= LastUsedBin; ++i) {
  307. result += Freqs[i];
  308. }
  309. return result;
  310. }
  311. double TFixedBinHistogram::GetSumBelowBound(double bound) {
  312. if (!IsInitialized) {
  313. Initialize();
  314. }
  315. if (IsEmpty) {
  316. return 0.0;
  317. }
  318. if (BinRange == 0.0) { // special case - all values added to histogram are the same
  319. return (bound > ReferencePoint) ? Sum : 0.0;
  320. }
  321. i32 bin = CalcBin(bound);
  322. if (bin < FirstUsedBin) {
  323. return 0.0;
  324. }
  325. if (bin > LastUsedBin) {
  326. return Sum;
  327. }
  328. double binStart = BinStart(bin);
  329. double binEnd = BinEnd(bin);
  330. double result = (bound > binEnd) ? Freqs[bin] : Freqs[bin] * (bound - binStart) / (binEnd - binStart);
  331. for (i32 i = bin - 1; i >= FirstUsedBin; --i) {
  332. result += Freqs[i];
  333. }
  334. return result;
  335. }
  336. double TFixedBinHistogram::CalcUpperBound(double sum) {
  337. if (!IsInitialized) {
  338. Initialize();
  339. }
  340. if (sum == 0.0) {
  341. return MinValue;
  342. }
  343. if (IsEmpty) {
  344. return MaxValue;
  345. }
  346. i32 currentBin = FirstUsedBin;
  347. double gatheredSum = 0.0;
  348. while (gatheredSum < sum && currentBin <= LastUsedBin) {
  349. gatheredSum += Freqs[currentBin];
  350. ++currentBin;
  351. }
  352. --currentBin;
  353. if ((gatheredSum <= sum && currentBin == LastUsedBin) || (Freqs[currentBin] == 0)) {
  354. return MaxValue;
  355. }
  356. double binStart = BinStart(currentBin);
  357. double binEnd = BinEnd(currentBin);
  358. return binEnd - (binEnd - binStart) * (gatheredSum - sum) / Freqs[currentBin];
  359. }
  360. double TFixedBinHistogram::CalcLowerBound(double sum) {
  361. if (!IsInitialized) {
  362. Initialize();
  363. }
  364. if (sum == 0.0) {
  365. return MaxValue;
  366. }
  367. if (IsEmpty) {
  368. return MinValue;
  369. }
  370. i32 currentBin = LastUsedBin;
  371. double gatheredSum = 0.0;
  372. while (gatheredSum < sum && currentBin >= FirstUsedBin) {
  373. gatheredSum += Freqs[currentBin];
  374. --currentBin;
  375. }
  376. ++currentBin;
  377. if ((gatheredSum <= sum && currentBin == FirstUsedBin) || (Freqs[currentBin] == 0)) {
  378. return MinValue;
  379. }
  380. double binStart = BinStart(currentBin);
  381. double binEnd = BinEnd(currentBin);
  382. return binStart + (binEnd - binStart) * (gatheredSum - sum) / Freqs[currentBin];
  383. }
  384. double TFixedBinHistogram::CalcUpperBoundSafe(double sum) {
  385. if (!Empty()) {
  386. sum = Max(Freqs[FirstUsedBin], sum);
  387. }
  388. return CalcUpperBound(sum);
  389. }
  390. double TFixedBinHistogram::CalcLowerBoundSafe(double sum) {
  391. if (!Empty()) {
  392. sum = Max(Freqs[LastUsedBin], sum);
  393. }
  394. return CalcLowerBound(sum);
  395. }
  396. double TFixedBinHistogram::CalcBinRange(double referencePoint, double maxValue) {
  397. return (maxValue - referencePoint) / ((double)Intervals - 0.02);
  398. }
  399. void TFixedBinHistogram::SetFrame(double minValue, double maxValue, bool clear) {
  400. MinValue = minValue;
  401. MaxValue = maxValue;
  402. ReferencePoint = MinValue;
  403. BinRange = CalcBinRange(ReferencePoint, MaxValue);
  404. FirstUsedBin = BaseIndex;
  405. LastUsedBin = (BinRange == 0.0) ? BaseIndex : BaseIndex + Intervals - 1;
  406. if (clear) {
  407. Freqs.assign(2 * Intervals, 0.0);
  408. ReserveFreqs.assign(2 * Intervals, 0.0);
  409. IsEmpty = false;
  410. IsInitialized = true;
  411. }
  412. }
  413. void TFixedBinHistogram::FromIHistogram(IHistogram* histo) {
  414. if (!histo) {
  415. ythrow yexception() << "Attempt to create TFixedBinFistogram from a NULL pointer";
  416. }
  417. Id = histo->GetId();
  418. if (histo->Empty()) {
  419. IsInitialized = false;
  420. IsEmpty = true;
  421. return;
  422. }
  423. SetFrame(histo->GetMinValue(), histo->GetMaxValue(), true);
  424. Sum = histo->GetSum();
  425. if (BinRange == 0.0) {
  426. Freqs[BaseIndex] = Sum;
  427. return;
  428. }
  429. for (i32 i = FirstUsedBin; i <= LastUsedBin; ++i) {
  430. Freqs[i] = histo->GetSumInRange(BinStart(i), BinEnd(i));
  431. }
  432. return;
  433. }
  434. void TFixedBinHistogram::Initialize() {
  435. if (IsInitialized) {
  436. return;
  437. }
  438. if (!TrainingSet || TrainingSet->size() == 0) {
  439. IsEmpty = true;
  440. return;
  441. }
  442. SetFrame(MinElement(TrainingSet->begin(), TrainingSet->end(), CompareWeightedValue)->first,
  443. MaxElement(TrainingSet->begin(), TrainingSet->end(), CompareWeightedValue)->first, true);
  444. for (TVector<TWeightedValue>::const_iterator it = TrainingSet->begin(); it != TrainingSet->end(); ++it) {
  445. Freqs[CalcBin(it->first)] += it->second;
  446. }
  447. TrainingSet.Destroy();
  448. }
  449. i32 TFixedBinHistogram::CalcBin(double value) {
  450. return (BinRange == 0.0) ? BaseIndex : static_cast<i32>(BaseIndex + (value - ReferencePoint) / BinRange);
  451. }
  452. double TFixedBinHistogram::CalcDensity(double value) {
  453. i32 bin = CalcBin(value);
  454. if (bin < 0 || bin >= (i32)Freqs.size() || BinRange == 0.0 || GetSum() == 0) {
  455. return 0.0;
  456. }
  457. return Freqs[bin] / GetSum() / BinRange;
  458. }
  459. double TFixedBinHistogram::BinStart(i32 i) {
  460. return Max(ReferencePoint + (i - BaseIndex) * BinRange, MinValue);
  461. }
  462. double TFixedBinHistogram::BinEnd(i32 i) {
  463. return Min(ReferencePoint + (i + 1 - BaseIndex) * BinRange, MaxValue);
  464. }
  465. void TFixedBinHistogram::Shrink(double newReferencePoint, double newMaxValue) {
  466. Y_ABORT_UNLESS(newReferencePoint < newMaxValue, "Invalid Shrink()");
  467. memset(&(ReserveFreqs[0]), 0, ReserveFreqs.size() * sizeof(double));
  468. double newBinRange = CalcBinRange(newReferencePoint, newMaxValue);
  469. for (i32 i = FirstUsedBin; i <= LastUsedBin; ++i) {
  470. double binStart = BinStart(i);
  471. double binEnd = BinEnd(i);
  472. double freq = Freqs[i];
  473. i32 firstCross = static_cast<i32>(BaseIndex + (binStart - newReferencePoint) / newBinRange);
  474. i32 lastCross = static_cast<i32>(BaseIndex + (binEnd - newReferencePoint) / newBinRange);
  475. for (i32 j = firstCross; j <= lastCross; ++j) {
  476. double newBinStart = newReferencePoint + (j - BaseIndex) * newBinRange;
  477. double newBinEnd = newReferencePoint + (j + 1 - BaseIndex) * newBinRange;
  478. double crossStart = Max(newBinStart, binStart);
  479. double crossEnd = Min(newBinEnd, binEnd);
  480. ReserveFreqs[j] += (binStart == binEnd) ? freq : freq * (crossEnd - crossStart) / (binEnd - binStart);
  481. }
  482. }
  483. Freqs.swap(ReserveFreqs);
  484. SetFrame(newReferencePoint, newMaxValue, false);
  485. }
  486. }