hyperloglog.cpp 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. #include "hyperloglog.h"
  2. #include <util/generic/bitops.h>
  3. #include <util/generic/yexception.h>
  4. #include <util/stream/output.h>
  5. #include <algorithm>
  6. #include <array>
  7. #include <cmath>
  8. #include <functional>
  9. namespace {
  10. using TLookup = std::array<double, 256>;
  11. struct TCorrection {
  12. TLookup Estimations;
  13. TLookup Biases;
  14. double GetBias(double e) const {
  15. for (size_t idx = 0;; ++idx) {
  16. const auto estr = Estimations[idx];
  17. if (estr >= e) {
  18. if (idx == 0) {
  19. return Biases[0];
  20. }
  21. const auto estl = Estimations[idx - 1];
  22. const auto biasl = Biases[idx - 1];
  23. const auto biasr = Biases[idx];
  24. const auto de = estr - estl;
  25. const auto db = biasr - biasl;
  26. const auto scale = e - estl;
  27. return biasl + scale * db / de;
  28. } else if (std::fabs(estr) < 1e-4) {
  29. //limiter
  30. return Biases[idx - 1];
  31. }
  32. }
  33. }
  34. };
  35. double EstimateBias(double e, unsigned precision) {
  36. static const TCorrection CORRECTIONS[1 + THyperLogLog::PRECISION_MAX - THyperLogLog::PRECISION_MIN] = {
  37. #include "hyperloglog_corrections.inc"
  38. };
  39. if (precision < THyperLogLog::PRECISION_MIN || precision > THyperLogLog::PRECISION_MAX) {
  40. return 0.;
  41. }
  42. return CORRECTIONS[precision - THyperLogLog::PRECISION_MIN].GetBias(e);
  43. }
  44. double GetThreshold(unsigned precision) {
  45. static const double THRESHOLD_DATA[1 + THyperLogLog::PRECISION_MAX - THyperLogLog::PRECISION_MIN] = {
  46. 10, // Precision 4
  47. 20, // Precision 5
  48. 40, // Precision 6
  49. 80, // Precision 7
  50. 220, // Precision 8
  51. 400, // Precision 9
  52. 900, // Precision 10
  53. 1800, // Precision 11
  54. 3100, // Precision 12
  55. 6500, // Precision 13
  56. 11500, // Precision 14
  57. 20000, // Precision 15
  58. 50000, // Precision 16
  59. 120000, // Precision 17
  60. 350000 // Precision 18
  61. };
  62. if (precision < THyperLogLog::PRECISION_MIN || precision > THyperLogLog::PRECISION_MAX) {
  63. return 0.;
  64. }
  65. return THRESHOLD_DATA[precision - THyperLogLog::PRECISION_MIN];
  66. }
  67. double EmpiricAlpha(size_t m) {
  68. switch (m) {
  69. case 16:
  70. return 0.673;
  71. case 32:
  72. return 0.697;
  73. case 64:
  74. return 0.709;
  75. default:
  76. return 0.7213 / (1.0 + 1.079 / m);
  77. }
  78. }
  79. double RawEstimate(const ui8* counts, size_t size) {
  80. double sum = {};
  81. for (size_t i = 0; i < size; ++i) {
  82. sum += std::pow(2.0, -counts[i]);
  83. }
  84. return EmpiricAlpha(size) * size * size / sum;
  85. }
  86. double LinearCounting(size_t registers, size_t zeroed) {
  87. return std::log(double(registers) / zeroed) * registers;
  88. }
  89. }
  90. THyperLogLogBase::THyperLogLogBase(unsigned precision)
  91. : Precision(precision) {
  92. Y_ENSURE(precision >= PRECISION_MIN && precision <= PRECISION_MAX);
  93. }
  94. void THyperLogLogBase::Update(ui64 hash) {
  95. const unsigned subHashBits = 8 * sizeof(hash) - Precision;
  96. const auto subHash = hash & MaskLowerBits(subHashBits);
  97. const auto leadingZeroes = subHash ? (subHashBits - GetValueBitCount(subHash)) : subHashBits;
  98. const ui8 weight = static_cast<ui8>(leadingZeroes + 1);
  99. const size_t reg = static_cast<size_t>(hash >> subHashBits);
  100. RegistersRef[reg] = std::max(RegistersRef[reg], weight);
  101. }
  102. void THyperLogLogBase::Merge(const THyperLogLogBase& rh) {
  103. Y_ENSURE(Precision == rh.Precision);
  104. std::transform(RegistersRef.begin(), RegistersRef.end(), rh.RegistersRef.begin(), RegistersRef.begin(), [](ui8 l, ui8 r) { return std::max(l, r); });
  105. }
  106. ui64 THyperLogLogBase::Estimate() const {
  107. const auto m = RegistersRef.size();
  108. const auto e = RawEstimate(RegistersRef.data(), m);
  109. const auto e_ = e <= 5 * m ? (e - EstimateBias(e, Precision)) : e;
  110. const auto v = std::count(RegistersRef.begin(), RegistersRef.end(), ui8(0));
  111. const auto h = v != 0 ? LinearCounting(m, v) : e_;
  112. return h <= GetThreshold(Precision) ? h : e_;
  113. }
  114. void THyperLogLogBase::Save(IOutputStream& out) const {
  115. out.Write(static_cast<char>(Precision));
  116. out.Write(RegistersRef.data(), RegistersRef.size() * sizeof(RegistersRef.front()));
  117. }