pfor_codec.h 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211
  1. #pragma once
  2. #include "codecs.h"
  3. #include "delta_codec.h"
  4. #include "tls_cache.h"
  5. #include <library/cpp/bit_io/bitinput.h>
  6. #include <library/cpp/bit_io/bitoutput.h>
  7. #include <util/string/cast.h>
  8. namespace NCodecs {
  9. template <typename T, bool WithDelta = false>
  10. class TPForCodec: public ICodec {
  11. using TUnsigned = std::make_unsigned_t<T>;
  12. typedef TDeltaCodec<TUnsigned> TDCodec;
  13. typedef std::conditional_t<WithDelta, typename TDCodec::TDelta, T> TValue;
  14. static_assert(std::is_unsigned<TValue>::value, "expect std:is_unsigned<TValue>::value");
  15. static const ui64 BitsInT = sizeof(TUnsigned) * 8;
  16. TDCodec DeltaCodec;
  17. public:
  18. static TStringBuf MyName();
  19. TPForCodec() {
  20. MyTraits.AssumesStructuredInput = true;
  21. MyTraits.SizeOfInputElement = sizeof(T);
  22. MyTraits.SizeOnDecodeMultiplier = sizeof(T);
  23. }
  24. TString GetName() const override {
  25. return ToString(MyName());
  26. }
  27. ui8 Encode(TStringBuf s, TBuffer& b) const override {
  28. b.Clear();
  29. if (s.empty()) {
  30. return 0;
  31. }
  32. b.Reserve(2 * s.size() + b.Size());
  33. if (WithDelta) {
  34. auto buffer = TBufferTlsCache::TlsInstance().Item();
  35. TBuffer& db = buffer.Get();
  36. db.Clear();
  37. db.Reserve(2 * s.size());
  38. DeltaCodec.Encode(s, db);
  39. s = TStringBuf{db.data(), db.size()};
  40. }
  41. TArrayRef<const TValue> tin{(const TValue*)s.data(), s.size() / sizeof(TValue)};
  42. const ui64 sz = tin.size();
  43. ui64 bitcounts[BitsInT + 1];
  44. Zero(bitcounts);
  45. ui32 zeros = 0;
  46. for (const TValue* it = tin.begin(); it != tin.end(); ++it) {
  47. TUnsigned v = 1 + (TUnsigned)*it;
  48. ui64 l = MostSignificantBit(v) + 1;
  49. ++bitcounts[l];
  50. if (!v) {
  51. ++zeros;
  52. }
  53. }
  54. // cumulative bit counts
  55. for (ui64 i = 0; i < BitsInT; ++i) {
  56. bitcounts[i + 1] += bitcounts[i];
  57. }
  58. bool hasexceptions = zeros;
  59. ui64 optimalbits = BitsInT;
  60. {
  61. ui64 excsize = 0;
  62. ui64 minsize = sz * BitsInT;
  63. for (ui64 current = BitsInT; current; --current) {
  64. ui64 size = bitcounts[current] * current + (sz - bitcounts[current]) * (current + 6 + excsize) + zeros * (current + 6);
  65. excsize += current * bitcounts[current];
  66. if (size < minsize) {
  67. minsize = size;
  68. optimalbits = current;
  69. hasexceptions = zeros || sz - bitcounts[current];
  70. }
  71. }
  72. }
  73. if (!optimalbits || BitsInT == optimalbits) {
  74. b.Append((ui8)-1);
  75. b.Append(s.data(), s.size());
  76. return 0;
  77. } else {
  78. NBitIO::TBitOutputVector<TBuffer> bout(&b);
  79. bout.Write(0, 1);
  80. bout.Write(hasexceptions, 1);
  81. bout.Write(optimalbits, 6);
  82. for (const TValue* it = tin.begin(); it != tin.end(); ++it) {
  83. TUnsigned word = 1 + (TUnsigned)*it;
  84. ui64 len = MostSignificantBit(word) + 1;
  85. if (len > optimalbits || !word) {
  86. Y_ENSURE(hasexceptions, " ");
  87. bout.Write(0, optimalbits);
  88. bout.Write(len, 6);
  89. bout.Write(word, len);
  90. } else {
  91. bout.Write(word, optimalbits);
  92. }
  93. }
  94. return bout.GetByteReminder();
  95. } // the rest of the last byte is zero padded. BitsInT is always > 7.
  96. }
  97. void Decode(TStringBuf s, TBuffer& b) const override {
  98. b.Clear();
  99. if (s.empty()) {
  100. return;
  101. }
  102. b.Reserve(s.size() * sizeof(T) + b.Size());
  103. ui64 isplain = 0;
  104. ui64 hasexceptions = 0;
  105. ui64 bits = 0;
  106. NBitIO::TBitInput bin(s);
  107. bin.ReadK<1>(isplain);
  108. bin.ReadK<1>(hasexceptions);
  109. bin.ReadK<6>(bits);
  110. if (Y_UNLIKELY(isplain)) {
  111. s.Skip(1);
  112. if (WithDelta) {
  113. DeltaCodec.Decode(s, b);
  114. } else {
  115. b.Append(s.data(), s.size());
  116. }
  117. } else {
  118. typename TDCodec::TDecoder decoder;
  119. if (hasexceptions) {
  120. ui64 word = 0;
  121. while (bin.Read(word, bits)) {
  122. if (word || (bin.ReadK<6>(word) && bin.Read(word, word))) {
  123. --word;
  124. TValue t = word;
  125. if (WithDelta) {
  126. if (decoder.Decode(t)) {
  127. TStringBuf r{(char*)&decoder.Result, sizeof(decoder.Result)};
  128. b.Append(r.data(), r.size());
  129. }
  130. } else {
  131. TStringBuf r{(char*)&t, sizeof(t)};
  132. b.Append(r.data(), r.size());
  133. }
  134. }
  135. }
  136. } else {
  137. ui64 word = 0;
  138. T outarr[256 / sizeof(T)];
  139. ui32 cnt = 0;
  140. while (true) {
  141. ui64 v = bin.Read(word, bits);
  142. if ((!v) | (!word))
  143. break;
  144. --word;
  145. TValue t = word;
  146. if (WithDelta) {
  147. if (decoder.Decode(t)) {
  148. outarr[cnt++] = decoder.Result;
  149. }
  150. } else {
  151. outarr[cnt++] = t;
  152. }
  153. if (cnt == Y_ARRAY_SIZE(outarr)) {
  154. b.Append((const char*)outarr, sizeof(outarr));
  155. cnt = 0;
  156. }
  157. }
  158. if (cnt) {
  159. b.Append((const char*)outarr, cnt * sizeof(T));
  160. }
  161. }
  162. }
  163. }
  164. protected:
  165. void DoLearn(ISequenceReader&) override {
  166. }
  167. };
  168. }