count_min_sketch.cpp 2.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
  1. #include "count_min_sketch.h"
  2. #include <util/system/compiler.h>
  3. #include <stdlib.h>
  4. namespace NKikimr {
  5. TCountMinSketch* TCountMinSketch::Create(ui64 width, ui64 depth) {
  6. auto size = StaticSize(width, depth);
  7. auto* data = ::malloc(size);
  8. auto* sketch = reinterpret_cast<TCountMinSketch*>(data);
  9. std::memset(sketch, 0, size);
  10. sketch->Width = width;
  11. sketch->Depth = depth;
  12. sketch->ElementCount = 0;
  13. return sketch;
  14. }
  15. TCountMinSketch* TCountMinSketch::FromString(const char* data, size_t size) {
  16. auto* from = reinterpret_cast<const TCountMinSketch*>(data);
  17. Y_ABORT_UNLESS(StaticSize(from->Width, from->Depth) == size);
  18. auto* dataDst = ::malloc(size);
  19. std::memcpy(dataDst, data, size);
  20. return reinterpret_cast<TCountMinSketch*>(dataDst);
  21. }
  22. void TCountMinSketch::operator delete(void* data) noexcept {
  23. ::free(data);
  24. }
  25. ui64 TCountMinSketch::Hash(const char* data, size_t size, size_t hashIndex) {
  26. // fnv1a
  27. ui64 hash = 14695981039346656037ULL + 31 * hashIndex;
  28. const unsigned char* ptr = (const unsigned char*)data;
  29. for (size_t i = 0; i < size; ++i, ++ptr) {
  30. hash = hash ^ (*ptr);
  31. hash = hash * 1099511628211ULL;
  32. }
  33. return hash;
  34. }
  35. void TCountMinSketch::Count(const char* data, size_t size) {
  36. ui32* start = Buckets();
  37. for (size_t d = 0; d < Depth; ++d, start += Width) {
  38. ui64 hash = Hash(data, size, d);
  39. ui32* bucket = start + hash % Width;
  40. if (Y_LIKELY(*bucket < std::numeric_limits<ui32>::max())) {
  41. ++*bucket;
  42. }
  43. }
  44. ++ElementCount;
  45. }
  46. ui32 TCountMinSketch::Probe(const char* data, size_t size) const {
  47. ui32 minValue = std::numeric_limits<ui32>::max();
  48. const ui32* start = Buckets();
  49. for (size_t d = 0; d < Depth; ++d, start += Width) {
  50. ui64 hash = Hash(data, size, d);
  51. const ui32* bucket = start + hash % Width;
  52. minValue = std::min(minValue, *bucket);
  53. }
  54. return minValue;
  55. }
  56. TCountMinSketch& TCountMinSketch::operator+=(const TCountMinSketch& rhs) {
  57. if (Width != rhs.Width || Depth != rhs.Depth) {
  58. return *this;
  59. }
  60. ui32* dst = Buckets();
  61. const ui32* src = rhs.Buckets();
  62. ui32* end = dst + Width * Depth;
  63. for (; dst != end; ++dst, ++src) {
  64. ui32 sum = *dst + *src;
  65. if (Y_UNLIKELY(sum < *dst)) {
  66. *dst = std::numeric_limits<ui32>::max();
  67. } else {
  68. *dst = sum;
  69. }
  70. }
  71. ElementCount += rhs.ElementCount;
  72. return *this;
  73. }
  74. } // namespace NKikimr