bit.h 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348
  1. #pragma once
  2. #include <util/generic/array_ref.h>
  3. #include <util/generic/vector.h>
  4. #include <util/stream/output.h>
  5. #include <util/stream/input.h>
  6. #include "huff.h"
  7. #include "compressor.h"
  8. #include "metainfo.h"
  9. namespace NCompProto {
  10. struct TBitBuffer {
  11. TVector<ui8> Out;
  12. ui64 Position;
  13. ui64 Size;
  14. ui8 Counter;
  15. TBitBuffer() {
  16. Clear();
  17. }
  18. static ui64 Read(const ui8* out, ui64 position, size_t size) {
  19. const ui8* dst = out + (size_t(position >> 3));
  20. ui64 outCode = *reinterpret_cast<const ui64*>(dst);
  21. ui8 shift = position & 7;
  22. return ((1ULL << size) - 1) & (outCode >> shift);
  23. }
  24. ui64 Read(ui64 position, size_t size) {
  25. return Read(&Out[0], position, size);
  26. }
  27. void Code(ui64 value, size_t size) {
  28. if (++Counter == 0) {
  29. Junk(257 * 64);
  30. }
  31. Position = Code(value, size, Position);
  32. }
  33. ui64 Code(ui64 value, size_t size, ui64 position) {
  34. ui8* dst = &Out[size_t(position >> 3)];
  35. ui64& outCode = *(ui64*)dst;
  36. ui8 shift = position & 7;
  37. ui64 mask = ((1ULL << size) - 1) << shift;
  38. outCode = ((value << shift) & mask) | (outCode & ~mask);
  39. return position + size;
  40. }
  41. void Junk(size_t junk = 1024) {
  42. size_t need = size_t(Position >> 3);
  43. if (Out.size() * 8 < Position + junk) {
  44. Out.resize(((need + junk) * 3) / 2);
  45. Size = Out.size();
  46. }
  47. }
  48. void Insert(ui64 value, ui64 position, size_t size) {
  49. ui64 newVal = Read(position, 56);
  50. position = Code(value, size, position);
  51. value = newVal;
  52. while (position < Position + 64) {
  53. newVal = Read(position + 56 - size, 56);
  54. position = Code(value, 56, position);
  55. value = newVal;
  56. }
  57. Position += size;
  58. }
  59. size_t ByteLength() const {
  60. return (Position + 7) / 8;
  61. }
  62. void ResetPosition() {
  63. Position = 0;
  64. Size = 0;
  65. }
  66. void Clear() {
  67. Counter = 0;
  68. Position = 0;
  69. Size = 0;
  70. Out.clear();
  71. Junk();
  72. }
  73. TArrayRef<const char> AsDataRegion() const {
  74. return TArrayRef<const char>(reinterpret_cast<const char*>(Out.data()), ByteLength());
  75. }
  76. };
  77. struct THuff {
  78. TCoder Coder;
  79. ui64 Position;
  80. THuff() {
  81. Position = (ui64)(-1);
  82. }
  83. void Load(IInputStream& stream) {
  84. TString name;
  85. Coder.InitDefault();
  86. Coder.Entries.clear();
  87. while (1) {
  88. stream >> name;
  89. if (name == "end")
  90. return;
  91. else if (name == "entry") {
  92. TCoderEntry entry;
  93. ui32 value;
  94. stream >> value;
  95. entry.MinValue = value;
  96. stream >> value;
  97. entry.Prefix = value;
  98. stream >> value;
  99. entry.PrefixBits = value;
  100. stream >> value;
  101. entry.AllBits = value;
  102. Coder.Entries.push_back(entry);
  103. }
  104. }
  105. }
  106. void Save(IOutputStream& stream, TString offset) {
  107. TString step = " ";
  108. for (size_t i = 0; i < Coder.Entries.size(); ++i) {
  109. stream << offset << step << "entry ";
  110. stream << (ui32)Coder.Entries[i].MinValue << " ";
  111. stream << (ui32)Coder.Entries[i].Prefix << " ";
  112. stream << (ui32)Coder.Entries[i].PrefixBits << " ";
  113. stream << (ui32)Coder.Entries[i].AllBits << " ";
  114. stream << Endl;
  115. }
  116. stream << offset << "end" << Endl;
  117. }
  118. void BeginElement(TBitBuffer& out) {
  119. Position = out.Position;
  120. }
  121. void Add(ui32 value, TBitBuffer& out) {
  122. size_t val = 0;
  123. ui64 code = Coder.Code(value, val);
  124. out.Code(code, val);
  125. }
  126. void AddDelayed(ui32 value, TBitBuffer& out) {
  127. size_t val = 0;
  128. ui64 code = Coder.Code(value, val);
  129. if (Position == (ui64)(-1)) {
  130. ythrow yexception() << "Position == (ui64)(-1)";
  131. }
  132. out.Insert(code, Position, val);
  133. out.Junk();
  134. }
  135. };
  136. struct THist {
  137. TAccum Accum;
  138. THist() {
  139. }
  140. void Load(IInputStream& stream) {
  141. TString name;
  142. while (1) {
  143. stream >> name;
  144. if (name == "end")
  145. return;
  146. }
  147. }
  148. // TODO: why not const TString& ???
  149. void Save(IOutputStream& /*stream*/, TString /*offset*/) {
  150. }
  151. void Add(ui32 value, TEmpty& /*empty*/) {
  152. Accum.Add(value);
  153. }
  154. void AddDelayed(ui32 value, TEmpty& /*empty*/) {
  155. Accum.Add(value);
  156. }
  157. void BeginElement(TEmpty& /*empty*/) {
  158. }
  159. };
  160. struct THistToHuff {
  161. static THistToHuff Instance() {
  162. return THistToHuff();
  163. }
  164. TEmpty Build() const {
  165. return TEmpty();
  166. }
  167. void Build(THuff& info, const THist& hist) const {
  168. Analyze(hist.Accum, info.Coder.Entries);
  169. info.Coder.Normalize();
  170. }
  171. };
  172. struct IDecompressor {
  173. // sequentially decompresses whole structure according to metainfo, starts at position offset
  174. virtual void Decompress(const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset) = 0;
  175. // decompresses one record of outer repeated structure starting at position offset
  176. virtual void DecompressOne(const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset, ui32 prevIndex = -1) = 0;
  177. virtual ~IDecompressor() = default;
  178. };
  179. template <class X>
  180. struct TMetaIterator: public IDecompressor {
  181. X Self;
  182. TMetaIterator() {
  183. Self.Parent = this;
  184. }
  185. private:
  186. inline void DecompressSingle(ui32 repeatedIndex, const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset) {
  187. Self.BeginElement(repeatedIndex);
  188. ui32 mask = table->Mask.Decompress(codes, offset);
  189. size_t index = 0;
  190. ui32 scalarMask = table->ScalarMask;
  191. while (mask || scalarMask) {
  192. if (mask & 1) {
  193. if (scalarMask & 1) {
  194. ui32 val = table->Scalar[index].Decompress(codes, offset);
  195. Self.SetScalar(index, val);
  196. } else {
  197. Self.GetDecompressor(index).Decompress(table->Repeated[index].Get(), codes, offset);
  198. }
  199. } else if ((scalarMask & 1) && table->Default[index].Type == TScalarDefaultValue::Fixed) {
  200. Self.SetScalar(index, table->Default[index].Value);
  201. }
  202. scalarMask = scalarMask >> 1;
  203. mask = mask >> 1;
  204. ++index;
  205. }
  206. Self.EndElement();
  207. }
  208. inline void DecompressSingleScalarsOnly(ui32 repeatedIndex, const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset) {
  209. Self.BeginElement(repeatedIndex);
  210. ui32 mask = table->Mask.Decompress(codes, offset);
  211. ui32 scalarMask = table->ScalarMask;
  212. size_t index = 0;
  213. while (scalarMask) {
  214. if (mask & 1) {
  215. ui32 val = table->Scalar[index].Decompress(codes, offset);
  216. Self.SetScalar(index, val);
  217. } else if (table->Default[index].Type == TScalarDefaultValue::Fixed) {
  218. Self.SetScalar(index, table->Default[index].Value);
  219. }
  220. mask = mask >> 1;
  221. scalarMask = scalarMask >> 1;
  222. ++index;
  223. }
  224. Self.EndElement();
  225. }
  226. public:
  227. void Decompress(const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset) override {
  228. ui64 locOffset = offset;
  229. ui32 count = table->Count.Decompress(codes, locOffset);
  230. ui32 repeatedIndex = (ui32)(-1);
  231. Self.BeginSelf(count, table->Id);
  232. if (table->RepeatedMask) {
  233. for (ui32 i = 0; i < count; ++i) {
  234. repeatedIndex += table->Index.Decompress(codes, locOffset);
  235. DecompressSingle(repeatedIndex, table, codes, locOffset);
  236. }
  237. } else {
  238. for (ui32 i = 0; i < count; ++i) {
  239. repeatedIndex += table->Index.Decompress(codes, locOffset);
  240. DecompressSingleScalarsOnly(repeatedIndex, table, codes, locOffset);
  241. }
  242. }
  243. offset = locOffset;
  244. Self.EndSelf();
  245. }
  246. // XXX: iterator needed?
  247. //
  248. // Following two functions serves the purpose of decompressing outer repeated-structure(such structure is mandatory now).
  249. // They can work for inner repeated structures too, if you supply correct table, codes and offset parameters.
  250. void DecompressOne(const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset, ui32 prevIndex = -1) override {
  251. table->Index.Decompress(codes, offset);
  252. DecompressSingle(prevIndex, table, codes, offset);
  253. }
  254. ui32 DecompressCount(const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset) {
  255. return table->Count.Decompress(codes, offset);
  256. }
  257. };
  258. template <class X>
  259. struct TParentHold {
  260. TMetaIterator<X>* Parent;
  261. TParentHold()
  262. : Parent(nullptr)
  263. {
  264. }
  265. };
  266. struct TEmptyDecompressor: public TParentHold<TEmptyDecompressor> {
  267. void BeginSelf(ui32 /*count*/, ui32 /*id*/) {
  268. }
  269. void EndSelf() {
  270. }
  271. void BeginElement(ui32 /*element*/) {
  272. }
  273. void EndElement() {
  274. }
  275. void SetScalar(size_t /*index*/, ui32 /*val*/) {
  276. }
  277. TMetaIterator<TEmptyDecompressor>& GetDecompressor(size_t index) {
  278. Y_UNUSED(index);
  279. return *Parent;
  280. }
  281. };
  282. inline TMetaIterator<TEmptyDecompressor>& GetEmptyDecompressor() {
  283. static TMetaIterator<TEmptyDecompressor> empty;
  284. return empty;
  285. }
  286. struct THuffToTable {
  287. static THuffToTable Instance() {
  288. return THuffToTable();
  289. }
  290. void Build(TTable& table, const THuff& huff) const {
  291. // can happen if a field was never serialized across whole structure
  292. if (huff.Coder.Entries.empty())
  293. return;
  294. for (ui32 i = 0; i < 64; ++i) {
  295. const TCoderEntry& entry = huff.Coder.GetEntry(i, table.Id[i]);
  296. table.CodeBase[i] = entry.MinValue;
  297. table.PrefLength[i] = entry.PrefixBits;
  298. table.CodeMask[i] = (1ULL << (entry.AllBits - entry.PrefixBits)) - 1ULL;
  299. table.Length[i] = entry.AllBits;
  300. }
  301. }
  302. };
  303. struct THuffToTableWithDecompressor: private THuffToTable {
  304. TSimpleSharedPtr<IDecompressor> Decompressor;
  305. THuffToTableWithDecompressor(TSimpleSharedPtr<IDecompressor> decompressor)
  306. : Decompressor(decompressor)
  307. {
  308. }
  309. TSimpleSharedPtr<IDecompressor> Build() const {
  310. return Decompressor;
  311. }
  312. void Build(TTable& table, const THuff& huff) const {
  313. THuffToTable::Build(table, huff);
  314. }
  315. };
  316. }