Browse Source

Row digests for store compactor

ifsmirnov 2 years ago
parent
commit
99cb30abce
2 changed files with 40 additions and 5 deletions
  1. 33 2
      library/cpp/tdigest/tdigest.cpp
  2. 7 3
      library/cpp/tdigest/tdigest.h

+ 33 - 2
library/cpp/tdigest/tdigest.cpp

@@ -19,11 +19,11 @@ TDigest::TDigest(double delta, double k, double firstValue)
     AddValue(firstValue);
 }
 
-TDigest::TDigest(const TString& serializedDigest)
+TDigest::TDigest(TStringBuf serializedDigest)
     : N(0)
 {
     NTDigest::TDigest digest;
-    Y_VERIFY(digest.ParseFromString(serializedDigest));
+    Y_VERIFY(digest.ParseFromArray(serializedDigest.data(), serializedDigest.size()));
     Delta = digest.GetDelta();
     K = digest.GetK();
     for (int i = 0; i < digest.centroids_size(); ++i) {
@@ -157,6 +157,33 @@ double TDigest::GetPercentile(double percentile) {
     return Centroids.back().Mean;
 }
 
+double TDigest::GetRank(double value) {
+    Compress();
+    if (Centroids.empty()) {
+        return 0.0;
+    }
+    if (value < Centroids.front().Mean) {
+        return 0.0;
+    }
+    if (value == Centroids.front().Mean) {
+        return Centroids.front().Count * 0.5 / N;
+    }
+    double sum = 0.0;
+    double prev_x = 0.0;
+    double prev_mean = Centroids.front().Mean;
+    for (const auto& C : Centroids) {
+        double current_x = sum + C.Count * 0.5;
+        if (value <= C.Mean) {
+            double k = (value - prev_mean) / (C.Mean - prev_mean);
+            return (prev_x + k * (current_x - prev_x)) / N;
+        }
+        sum += C.Count;
+        prev_mean = C.Mean;
+        prev_x = current_x;
+    }
+    return 1.0;
+}
+
 TString TDigest::Serialize() {
     Compress();
     NTDigest::TDigest digest;
@@ -169,3 +196,7 @@ TString TDigest::Serialize() {
     }
     return digest.SerializeAsString();
 }
+
+i64 TDigest::GetCount() const {
+    return std::llround(N);
+}

+ 7 - 3
library/cpp/tdigest/tdigest.h

@@ -1,7 +1,6 @@
 #pragma once
 
-#include <util/generic/map.h>
-#include <util/generic/list.h>
+#include <util/generic/string.h>
 #include <util/generic/vector.h>
 
 class TDigest {
@@ -50,13 +49,18 @@ protected:
 public:
     TDigest(double delta = 0.01, double k = 25);
     TDigest(double delta, double k, double firstValue);
-    TDigest(const TString& serializedDigest);
+    TDigest(TStringBuf serializedDigest);
     TDigest(const TDigest* digest1, const TDigest* digest2); // merge
+
     TString Serialize();
+
     TDigest operator+(const TDigest& other);
     TDigest& operator+=(const TDigest& other);
+
     void AddValue(double value);
     void Compress();
     void Clear();
     double GetPercentile(double percentile);
+    double GetRank(double value);
+    i64 GetCount() const;
 };