comptable.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443
  1. #include <util/system/defaults.h>
  2. #include <util/system/filemap.h>
  3. #include <util/generic/string.h>
  4. #include <util/generic/vector.h>
  5. #include <util/string/cast.h>
  6. #include <util/stream/file.h>
  7. #include <library/cpp/compproto/huff.h>
  8. #include "comptable.h"
  9. #include <cstdlib>
  10. namespace NCompTable {
  11. static const ui32 magicHashMul = 0xd077cd1f;
  12. static const size_t hashSizeLog = 18;
  13. static const size_t hashSize = 1 << hashSizeLog;
  14. size_t HashIndex(ui32 value, ui32 hashMul) {
  15. return (value * hashMul) >> (32 - hashSizeLog);
  16. }
  17. void TCompressorTable::GetHuffCode(const NCompProto::TCoderEntry& entry, ui32 value, ui64& bitCode, ui8& bitLength) const {
  18. ui64 code = value - entry.MinValue;
  19. bitCode = (code << entry.PrefixBits) | entry.Prefix;
  20. bitLength = entry.AllBits;
  21. }
  22. void TCompressorTable::GetLastHuffCode(ui32 value, ui64& bitCode, ui8& bitLength) const {
  23. GetHuffCode(HuffCodes[9], value, bitCode, bitLength);
  24. }
  25. void TCompressorTable::GetHuffCode(ui32 value, ui64& bitCode, ui8& bitLength) const {
  26. for (auto huffCode : HuffCodes) {
  27. ui32 minValue = huffCode.MinValue;
  28. if (minValue <= value && value < huffCode.MaxValue()) {
  29. GetHuffCode(huffCode, value, bitCode, bitLength);
  30. return;
  31. }
  32. }
  33. abort();
  34. }
  35. ui8 TCompressorTable::GetHuffIndex(ui8 prefix) {
  36. for (size_t i = 0; i < 10; ++i) {
  37. if ((prefix & ((1 << HuffCodes[i].PrefixBits) - 1)) == HuffCodes[i].Prefix) {
  38. return i;
  39. }
  40. }
  41. abort();
  42. return 0;
  43. }
  44. void TCompressorTable::BuildHuffCodes(i64 totalFreq, i64 freqs[65536]) {
  45. i64 add = 1;
  46. while (1) {
  47. if (BuildHuffCodes(totalFreq, freqs, add)) {
  48. return;
  49. }
  50. add = add * 2;
  51. }
  52. }
  53. bool TCompressorTable::BuildHuffCodes(i64 totalFreq, i64 freqs[65536], i64 add) {
  54. TVector<NCompProto::TCode> codes;
  55. size_t bits[] = {0, 1, 2, 4, 8, 10, 12, 14, 16, 32};
  56. size_t off = 0;
  57. ui32 total = totalFreq;
  58. for (size_t i = 0; i < 9; ++i) {
  59. size_t size = 1 << bits[i];
  60. ui32 weight = 0;
  61. for (size_t j = off; j < off + size && j < 65536; ++j) {
  62. weight += freqs[j];
  63. }
  64. codes.push_back(NCompProto::TCode(weight + add, ui32(off), bits[i]));
  65. total = total > weight ? total - weight : 0;
  66. off += size;
  67. }
  68. codes.push_back(NCompProto::TCode(total + add, 0, 32));
  69. i64 ret = NCompProto::BuildHuff(codes);
  70. Y_UNUSED(ret);
  71. for (size_t i = 0; i < codes.size(); ++i) {
  72. NCompProto::TCoderEntry& code = HuffCodes[i];
  73. code.MinValue = codes[i].Start;
  74. code.Prefix = codes[i].Prefix;
  75. code.PrefixBits = codes[i].PrefLength;
  76. code.AllBits = code.PrefixBits + codes[i].Bits;
  77. if (code.PrefixBits > 8) {
  78. return false;
  79. }
  80. }
  81. for (size_t i = 0; i < 256; ++i) {
  82. HuffIndices[i] = GetHuffIndex(ui8(i));
  83. }
  84. return true;
  85. }
  86. template <class TIterator>
  87. void Iterate(const TStringBuf& data, TIterator& iterator) {
  88. size_t i = 0;
  89. iterator.Visit(ui32(data.size()));
  90. for (; i + 3 < data.size(); i += 4) {
  91. iterator.Visit(reinterpret_cast<const ui32*>(data.data() + i)[0]);
  92. }
  93. if (i != data.size()) {
  94. ui32 buffer[1] = {0};
  95. memcpy(buffer, data.data() + i, data.size() - i);
  96. iterator.Visit(buffer[0]);
  97. }
  98. }
  99. class TDataCompressor {
  100. ui32 Keys[hashSize];
  101. ui32 Vals[hashSize];
  102. ui64 BitCodes[hashSize];
  103. ui8 BitLengths[hashSize];
  104. ui32 HashMul;
  105. const TCompressorTable Table;
  106. public:
  107. TDataCompressor(const TCompressorTable& table)
  108. : Table(table)
  109. {
  110. HashMul = table.HashMul;
  111. for (size_t i = 0; i < hashSize; ++i) {
  112. Keys[i] = 0;
  113. Vals[i] = ui32(-1);
  114. }
  115. for (size_t i = 0; i < 65536; ++i) {
  116. size_t index = HashIndex(table.Table[i], table.HashMul);
  117. Keys[index] = table.Table[i];
  118. Vals[index] = i;
  119. table.GetHuffCode(ui32(i), BitCodes[index], BitLengths[index]);
  120. }
  121. }
  122. void GetStat(ui32 val, ui32& stat) const {
  123. size_t index = HashIndex(val, HashMul);
  124. if (Keys[index] == val) {
  125. stat = Vals[index];
  126. } else {
  127. stat = ui32(-1);
  128. }
  129. }
  130. void GetStat(ui32 val, ui64& bitCode, ui8& bitLength) const {
  131. size_t index = HashIndex(val, HashMul);
  132. if (Keys[index] == val) {
  133. bitCode = BitCodes[index];
  134. bitLength = BitLengths[index];
  135. } else {
  136. Table.GetLastHuffCode(val, bitCode, bitLength);
  137. }
  138. }
  139. size_t Compress4(const ui32* data4, ui8* dataBuff) const {
  140. ui8* oldBuff = dataBuff;
  141. ++dataBuff;
  142. ui32 status = 0;
  143. for (size_t i = 0; i < 4; ++i) {
  144. status = (status << 2);
  145. ui32 data = data4[i];
  146. if (data == 0) {
  147. continue;
  148. }
  149. ui32 stat;
  150. GetStat(data, stat);
  151. if (stat < 256) {
  152. memcpy(dataBuff, &stat, 1);
  153. dataBuff += 1;
  154. status += 1;
  155. } else if (stat < 65536) {
  156. memcpy(dataBuff, &stat, 2);
  157. dataBuff += 2;
  158. status += 2;
  159. } else {
  160. memcpy(dataBuff, &data, 4);
  161. dataBuff += 4;
  162. status += 3;
  163. }
  164. }
  165. oldBuff[0] = ui8(status);
  166. return dataBuff - oldBuff;
  167. }
  168. struct TCompressorIterator {
  169. TVector<char>& Result;
  170. const TDataCompressor& Compressor;
  171. ui32 Cached[4];
  172. size_t Index;
  173. size_t RealSize;
  174. TCompressorIterator(TVector<char>& result, const TDataCompressor& compressor)
  175. : Result(result)
  176. , Compressor(compressor)
  177. , Index(0)
  178. , RealSize(0)
  179. {
  180. }
  181. void Flush() {
  182. Result.yresize(RealSize + 32);
  183. RealSize += Compressor.Compress4(Cached, reinterpret_cast<ui8*>(Result.data()) + RealSize);
  184. Index = 0;
  185. }
  186. void Visit(const ui32 data) {
  187. Cached[Index] = data;
  188. ++Index;
  189. if (Index == 4) {
  190. Flush();
  191. }
  192. }
  193. ~TCompressorIterator() {
  194. if (Index != 0) {
  195. for (size_t i = Index; i < 4; ++i) {
  196. Cached[i] = 0;
  197. }
  198. Flush();
  199. }
  200. Result.yresize(RealSize);
  201. }
  202. };
  203. struct THQCompressorIterator {
  204. TVector<char>& Result;
  205. const TDataCompressor& Compressor;
  206. size_t RealSize;
  207. ui64 Offset;
  208. THQCompressorIterator(TVector<char>& result, const TDataCompressor& compressor)
  209. : Result(result)
  210. , Compressor(compressor)
  211. , RealSize(0)
  212. , Offset(0)
  213. {
  214. }
  215. void Visit(const ui32 data) {
  216. size_t byteOff = Offset >> 3;
  217. Result.yresize(byteOff + 32);
  218. ui64 bitCode;
  219. ui8 bitLength;
  220. Compressor.GetStat(data, bitCode, bitLength);
  221. ui64 dst;
  222. memcpy(&dst, &Result[byteOff], sizeof(dst));
  223. ui64 mask = ((1ULL << (Offset & 7)) - 1ULL);
  224. dst = (dst & mask) | (bitCode << (Offset & 7));
  225. memcpy(&Result[byteOff], &dst, sizeof(dst));
  226. Offset += bitLength;
  227. }
  228. ~THQCompressorIterator() {
  229. Result.yresize((Offset + 7) >> 3);
  230. if (Offset & 7)
  231. Result.back() &= (1 << (Offset & 7)) - 1;
  232. }
  233. };
  234. template <bool HQ>
  235. void Compress(const TStringBuf& stringBuf, TVector<char>& rslt) const {
  236. if (!HQ) {
  237. TCompressorIterator iterator(rslt, *this);
  238. Iterate(stringBuf, iterator);
  239. } else {
  240. THQCompressorIterator iterator(rslt, *this);
  241. Iterate(stringBuf, iterator);
  242. }
  243. }
  244. };
  245. class TDataDecompressor {
  246. const TCompressorTable Table;
  247. public:
  248. TDataDecompressor(const TCompressorTable& table)
  249. : Table(table)
  250. {
  251. }
  252. size_t Decompress(ui32* data4, const ui8* dataBuff, ui32 type) const {
  253. if (type == 0) {
  254. data4[0] = 0;
  255. return 0;
  256. }
  257. if (type == 1) {
  258. ui8 masked;
  259. memcpy(&masked, dataBuff, sizeof(masked));
  260. data4[0] = Table.Table[masked];
  261. return 1;
  262. } else if (type == 2) {
  263. ui16 masked;
  264. memcpy(&masked, dataBuff, sizeof(masked));
  265. data4[0] = Table.Table[masked];
  266. return 2;
  267. } else {
  268. memcpy(data4, dataBuff, sizeof(*data4));
  269. return 4;
  270. }
  271. }
  272. size_t Decompress(ui32* data4, const ui8* dataBuff) const {
  273. ui32 status = *dataBuff;
  274. const ui8* oldData = dataBuff;
  275. ++dataBuff;
  276. dataBuff += Decompress(data4 + 0, dataBuff, (status >> 6));
  277. dataBuff += Decompress(data4 + 1, dataBuff, (status >> 4) & 0x3);
  278. dataBuff += Decompress(data4 + 2, dataBuff, (status >> 2) & 0x3);
  279. dataBuff += Decompress(data4 + 3, dataBuff, (status)&0x3);
  280. return dataBuff - oldData;
  281. }
  282. void Decompress(ui32* data, const ui8* dataBuff, ui64& offset, ui64 border) const {
  283. size_t off = offset >> 3;
  284. ui64 val = 0;
  285. if (border - off >= sizeof(val)) {
  286. memcpy(&val, dataBuff + off, sizeof(val));
  287. } else {
  288. memcpy(&val, dataBuff + off, border - off);
  289. }
  290. val >>= (offset & 7);
  291. ui8 index = Table.HuffIndices[ui8(val)];
  292. const NCompProto::TCoderEntry& entry = Table.HuffCodes[index];
  293. ui8 allBits = entry.AllBits;
  294. val = (val & ((1ULL << allBits) - 1ULL)) >> entry.PrefixBits;
  295. val = val + entry.MinValue;
  296. data[0] = (index == 9) ? val : Table.Table[val];
  297. offset += allBits;
  298. }
  299. size_t GetJunkSize() const {
  300. return 8;
  301. }
  302. template <bool HQ>
  303. void Decompress(const TStringBuf& dataBuf, TVector<char>& rslt) const {
  304. rslt.clear();
  305. if (dataBuf.empty()) {
  306. return;
  307. }
  308. const ui8* src = reinterpret_cast<const ui8*>(dataBuf.data());
  309. ui64 border = dataBuf.size();
  310. ui32 len = 0;
  311. ui32 nullTerm = 1;
  312. if (HQ) {
  313. ui64 offset = 0;
  314. ui32 length = 0;
  315. Decompress(&length, src, offset, border);
  316. size_t length32 = (length + 3) / 4;
  317. rslt.yresize(length32 * 4 + nullTerm);
  318. ui32* result = reinterpret_cast<ui32*>(rslt.data());
  319. len = length;
  320. for (size_t i = 0; i < length32; ++i) {
  321. Decompress(&result[i], src, offset, border);
  322. }
  323. } else {
  324. ui32 data[4];
  325. src += Decompress(data, src);
  326. len = data[0];
  327. size_t length32x4 = (4 + len + 15) / 16;
  328. rslt.yresize(length32x4 * 16 + nullTerm);
  329. ui32* result = reinterpret_cast<ui32*>(rslt.data());
  330. for (size_t i = 0; i < 3; ++i) {
  331. result[i] = data[i + 1];
  332. }
  333. for (size_t j = 1; j < length32x4; ++j) {
  334. src += Decompress(&result[j * 4 - 1], src);
  335. }
  336. }
  337. rslt.resize(len);
  338. }
  339. };
  340. struct TSamplerIterator {
  341. TDataSampler& Sampler;
  342. TSamplerIterator(TDataSampler& sampler)
  343. : Sampler(sampler)
  344. {
  345. }
  346. void Visit(const ui32 data) {
  347. Sampler.AddStat(data);
  348. }
  349. };
  350. struct TGreaterComparer {
  351. //std::greater in arcadia???
  352. bool operator()(ui64 a, ui64 b) const {
  353. return a > b;
  354. }
  355. };
  356. TDataSampler::TDataSampler() {
  357. memset(this, 0, sizeof(*this));
  358. }
  359. void TDataSampler::BuildTable(TCompressorTable& table) const {
  360. std::vector<ui64> sorted;
  361. for (size_t i = 0; i < Size; ++i) {
  362. ui64 res = (ui64(EntryHit[i]) << 32) + ui32(i);
  363. sorted.push_back(res);
  364. }
  365. std::vector<ui32> entryTbl(Size);
  366. std::sort(sorted.begin(), sorted.end(), TGreaterComparer());
  367. table.HashMul = magicHashMul;
  368. i64 freqs[65536];
  369. for (size_t i = 0; i < 65536; ++i) {
  370. ui32 ind = ui32(sorted[i]);
  371. table.Table[i] = EntryVal[ind];
  372. freqs[i] = EntryHit[ind];
  373. }
  374. table.BuildHuffCodes(Counter, freqs);
  375. }
  376. void TDataSampler::AddStat(ui32 val) {
  377. ++Counter;
  378. size_t hashInd = HashIndex(val, magicHashMul);
  379. if (EntryVal[hashInd] == val) {
  380. ++EntryHit[hashInd];
  381. } else if (EntryHit[hashInd] > 1) {
  382. --EntryHit[hashInd];
  383. } else {
  384. EntryHit[hashInd] = 1;
  385. EntryVal[hashInd] = val;
  386. }
  387. }
  388. void TDataSampler::AddStat(const TStringBuf& stringBuf) {
  389. TSamplerIterator iterator(*this);
  390. Iterate(stringBuf, iterator);
  391. }
  392. TChunkCompressor::TChunkCompressor(bool highQuality, const TCompressorTable& table)
  393. : HighQuality(highQuality)
  394. {
  395. Compressor.Reset(new TDataCompressor(table));
  396. }
  397. void TChunkCompressor::Compress(TStringBuf data, TVector<char>* rslt) const {
  398. if (HighQuality) {
  399. Compressor->Compress<1>(data, *rslt);
  400. } else {
  401. Compressor->Compress<0>(data, *rslt);
  402. }
  403. }
  404. TChunkCompressor::~TChunkCompressor() = default;
  405. TChunkDecompressor::TChunkDecompressor(bool highQuality, const TCompressorTable& table)
  406. : HighQuality(highQuality)
  407. {
  408. Decompressor.Reset(new TDataDecompressor(table));
  409. }
  410. void TChunkDecompressor::Decompress(TStringBuf data, TVector<char>* rslt) const {
  411. if (HighQuality) {
  412. Decompressor->Decompress<1>(data, *rslt);
  413. } else {
  414. Decompressor->Decompress<0>(data, *rslt);
  415. }
  416. }
  417. TChunkDecompressor::~TChunkDecompressor() = default;
  418. }