topfreq.cpp 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213
  1. #include "topfreq.h"
  2. #include <cmath>
  3. #include <algorithm>
  4. using namespace NKikimr;
  5. using namespace NUdf;
  6. template <typename THash, typename TEquals>
  7. TTopFreqBase<THash, TEquals>::TTopFreqBase(THash hash, TEquals equals)
  8. : Indices_(0, hash, equals)
  9. {}
  10. template <typename THash, typename TEquals>
  11. void TTopFreqBase<THash, TEquals>::Init(const TUnboxedValuePod& value, const ui32 minSize, const ui32 maxSize) {
  12. MinSize_ = minSize;
  13. MaxSize_ = maxSize;
  14. Freqs_.reserve(MaxSize_ + 1);
  15. Indices_.reserve(MaxSize_ + 1);
  16. AddValue(value);
  17. }
  18. template <typename THash, typename TEquals>
  19. void TTopFreqBase<THash, TEquals>::Merge(const TTopFreqBase& topFreq1, const TTopFreqBase& topFreq2) {
  20. MinSize_ = std::max(topFreq1.MinSize_, topFreq2.MinSize_);
  21. MaxSize_ = std::max(topFreq1.MaxSize_, topFreq2.MaxSize_);
  22. Freqs_.reserve(std::max(MaxSize_ + 1, ui32(topFreq1.Freqs_.size() + topFreq2.Freqs_.size())));
  23. Indices_.reserve(MaxSize_ + 1);
  24. Add(topFreq1);
  25. Add(topFreq2);
  26. }
  27. template <typename THash, typename TEquals>
  28. void TTopFreqBase<THash, TEquals>::Deserialize(const TUnboxedValuePod& serialized) {
  29. MinSize_ = serialized.GetElement(0).Get<ui32>();
  30. MaxSize_ = serialized.GetElement(1).Get<ui32>();
  31. Freqs_.reserve(MaxSize_ + 1);
  32. Indices_.reserve(MaxSize_ + 1);
  33. const auto listIter = serialized.GetElement(2).GetListIterator();
  34. for (TUnboxedValue current; listIter.Next(current);) {
  35. Update(current.GetElement(1), current.GetElement(0).Get<ui64>());
  36. }
  37. }
  38. template <typename THash, typename TEquals>
  39. TUnboxedValue TTopFreqBase<THash, TEquals>::Convert(const IValueBuilder* valueBuilder) const {
  40. TUnboxedValue* values = nullptr;
  41. const auto list = valueBuilder->NewArray(Freqs_.size(), values);
  42. for (const auto& item : Freqs_) {
  43. TUnboxedValue* items = nullptr;
  44. *values++ = valueBuilder->NewArray(2U, items);
  45. items[0] = TUnboxedValuePod(item.second);
  46. items[1] = item.first;
  47. }
  48. return list;
  49. }
  50. template <typename THash, typename TEquals>
  51. void TTopFreqBase<THash, TEquals>::Add(const TTopFreqBase& otherModeCalc) {
  52. for (auto& it : otherModeCalc.Freqs_) {
  53. Update(it.first, it.second);
  54. }
  55. TryCompress();
  56. }
  57. template <typename THash, typename TEquals>
  58. TUnboxedValue TTopFreqBase<THash, TEquals>::Get(const IValueBuilder* builder, ui32 resultSize) {
  59. resultSize = std::min(resultSize, ui32(Freqs_.size()));
  60. Compress(resultSize, true);
  61. return Convert(builder);
  62. }
  63. template <typename THash, typename TEquals>
  64. void TTopFreqBase<THash, TEquals>::AddValue(const TUnboxedValuePod& value) {
  65. Update(value, 1);
  66. TryCompress();
  67. }
  68. template <typename THash, typename TEquals>
  69. void TTopFreqBase<THash, TEquals>::Update(const TUnboxedValuePod& value, ui64 freq) {
  70. Freqs_.emplace_back(TUnboxedValuePod(value), freq);
  71. auto mapInsertResult = Indices_.emplace(TUnboxedValuePod(value), Freqs_.size() - 1);
  72. if (!mapInsertResult.second) {
  73. Freqs_[mapInsertResult.first->second].second += freq;
  74. Freqs_.pop_back();
  75. }
  76. }
  77. template <typename THash, typename TEquals>
  78. void TTopFreqBase<THash, TEquals>::TryCompress() {
  79. auto freqSize = Freqs_.size();
  80. if (freqSize > MaxSize_) {
  81. Compress(MinSize_);
  82. }
  83. }
  84. template <typename THash, typename TEquals>
  85. void TTopFreqBase<THash, TEquals>::Compress(ui32 newSize, bool sort) {
  86. auto compare = [](const TVectorElement& v1, const TVectorElement& v2) {
  87. return v1.second > v2.second;
  88. };
  89. if (sort) {
  90. std::sort(Freqs_.begin(), Freqs_.end(), compare);
  91. } else {
  92. std::nth_element(Freqs_.begin(), Freqs_.begin() + newSize - 1, Freqs_.end(), compare);
  93. }
  94. Indices_.clear();
  95. Freqs_.resize(newSize);
  96. for (ui32 i = 0; i < newSize; i++) {
  97. Indices_[Freqs_[i].first] = i;
  98. }
  99. }
  100. template <typename THash, typename TEquals>
  101. TUnboxedValue TTopFreqBase<THash, TEquals>::Serialize(const IValueBuilder* builder) {
  102. if (ui32(Freqs_.size()) > MinSize_) {
  103. Compress(MinSize_);
  104. }
  105. TUnboxedValue* items = nullptr;
  106. auto tuple = builder->NewArray(3U, items);
  107. items[0] = TUnboxedValuePod(MinSize_);
  108. items[1] = TUnboxedValuePod(MaxSize_);
  109. items[2] = Convert(builder);
  110. return tuple;
  111. }
  112. template <EDataSlot Slot>
  113. TTopFreqData<Slot>::TTopFreqData(const TUnboxedValuePod& value, const ui32 minSize, const ui32 maxSize)
  114. : TBase(TUnboxedValueHash<Slot>(), TUnboxedValueEquals<Slot>())
  115. {
  116. TBase::Init(value, minSize, maxSize);
  117. }
  118. template <EDataSlot Slot>
  119. TTopFreqData<Slot>::TTopFreqData(const TTopFreqData& topFreq1, const TTopFreqData& topFreq2)
  120. : TBase(TUnboxedValueHash<Slot>(), TUnboxedValueEquals<Slot>())
  121. {
  122. TBase::Merge(topFreq1, topFreq2);
  123. }
  124. template <EDataSlot Slot>
  125. TTopFreqData<Slot>::TTopFreqData(const TUnboxedValuePod& serialized)
  126. : TBase(TUnboxedValueHash<Slot>(), TUnboxedValueEquals<Slot>())
  127. {
  128. TBase::Deserialize(serialized);
  129. }
  130. template <EDataSlot Slot>
  131. TUnboxedValue TTopFreqData<Slot>::Serialize(const IValueBuilder* builder) {
  132. return TBase::Serialize(builder);
  133. }
  134. template <EDataSlot Slot>
  135. TUnboxedValue TTopFreqData<Slot>::Get(const IValueBuilder* builder, ui32 resultSize) {
  136. return TBase::Get(builder, resultSize);
  137. }
  138. template <EDataSlot Slot>
  139. void TTopFreqData<Slot>::AddValue(const TUnboxedValuePod& value) {
  140. TBase::AddValue(value);
  141. }
  142. #define INSTANCE_FOR(slot, ...) \
  143. template class TTopFreqData<EDataSlot::slot>;
  144. UDF_TYPE_ID_MAP(INSTANCE_FOR)
  145. #undef INSTANCE_FOR
  146. TTopFreqGeneric::TTopFreqGeneric(const TUnboxedValuePod& value, const ui32 minSize, const ui32 maxSize,
  147. IHash::TPtr hash, IEquate::TPtr equate)
  148. : TBase(TGenericHash{hash}, TGenericEquals{equate})
  149. {
  150. TBase::Init(value, minSize, maxSize);
  151. }
  152. TTopFreqGeneric::TTopFreqGeneric(const TTopFreqGeneric& topFreq1, const TTopFreqGeneric& topFreq2,
  153. IHash::TPtr hash, IEquate::TPtr equate)
  154. : TBase(TGenericHash{hash}, TGenericEquals{equate})
  155. {
  156. TBase::Merge(topFreq1, topFreq2);
  157. }
  158. TTopFreqGeneric::TTopFreqGeneric(const TUnboxedValuePod& serialized,
  159. IHash::TPtr hash, IEquate::TPtr equate)
  160. : TBase(TGenericHash{hash}, TGenericEquals{equate})
  161. {
  162. TBase::Deserialize(serialized);
  163. }
  164. TUnboxedValue TTopFreqGeneric::Serialize(const IValueBuilder* builder) {
  165. return TBase::Serialize(builder);
  166. }
  167. TUnboxedValue TTopFreqGeneric::Get(const IValueBuilder* builder, ui32 resultSize) {
  168. return TBase::Get(builder, resultSize);
  169. }
  170. void TTopFreqGeneric::AddValue(const TUnboxedValuePod& value) {
  171. TBase::AddValue(value);
  172. }