123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348 |
- #pragma once
- #include <util/generic/array_ref.h>
- #include <util/generic/vector.h>
- #include <util/stream/output.h>
- #include <util/stream/input.h>
- #include "huff.h"
- #include "compressor.h"
- #include "metainfo.h"
- namespace NCompProto {
- struct TBitBuffer {
- TVector<ui8> Out;
- ui64 Position;
- ui64 Size;
- ui8 Counter;
- TBitBuffer() {
- Clear();
- }
- static ui64 Read(const ui8* out, ui64 position, size_t size) {
- const ui8* dst = out + (size_t(position >> 3));
- ui64 outCode = *reinterpret_cast<const ui64*>(dst);
- ui8 shift = position & 7;
- return ((1ULL << size) - 1) & (outCode >> shift);
- }
- ui64 Read(ui64 position, size_t size) {
- return Read(&Out[0], position, size);
- }
- void Code(ui64 value, size_t size) {
- if (++Counter == 0) {
- Junk(257 * 64);
- }
- Position = Code(value, size, Position);
- }
- ui64 Code(ui64 value, size_t size, ui64 position) {
- ui8* dst = &Out[size_t(position >> 3)];
- ui64& outCode = *(ui64*)dst;
- ui8 shift = position & 7;
- ui64 mask = ((1ULL << size) - 1) << shift;
- outCode = ((value << shift) & mask) | (outCode & ~mask);
- return position + size;
- }
- void Junk(size_t junk = 1024) {
- size_t need = size_t(Position >> 3);
- if (Out.size() * 8 < Position + junk) {
- Out.resize(((need + junk) * 3) / 2);
- Size = Out.size();
- }
- }
- void Insert(ui64 value, ui64 position, size_t size) {
- ui64 newVal = Read(position, 56);
- position = Code(value, size, position);
- value = newVal;
- while (position < Position + 64) {
- newVal = Read(position + 56 - size, 56);
- position = Code(value, 56, position);
- value = newVal;
- }
- Position += size;
- }
- size_t ByteLength() const {
- return (Position + 7) / 8;
- }
- void ResetPosition() {
- Position = 0;
- Size = 0;
- }
- void Clear() {
- Counter = 0;
- Position = 0;
- Size = 0;
- Out.clear();
- Junk();
- }
- TArrayRef<const char> AsDataRegion() const {
- return TArrayRef<const char>(reinterpret_cast<const char*>(Out.data()), ByteLength());
- }
- };
- struct THuff {
- TCoder Coder;
- ui64 Position;
- THuff() {
- Position = (ui64)(-1);
- }
- void Load(IInputStream& stream) {
- TString name;
- Coder.InitDefault();
- Coder.Entries.clear();
- while (1) {
- stream >> name;
- if (name == "end")
- return;
- else if (name == "entry") {
- TCoderEntry entry;
- ui32 value;
- stream >> value;
- entry.MinValue = value;
- stream >> value;
- entry.Prefix = value;
- stream >> value;
- entry.PrefixBits = value;
- stream >> value;
- entry.AllBits = value;
- Coder.Entries.push_back(entry);
- }
- }
- }
- void Save(IOutputStream& stream, TString offset) {
- TString step = " ";
- for (size_t i = 0; i < Coder.Entries.size(); ++i) {
- stream << offset << step << "entry ";
- stream << (ui32)Coder.Entries[i].MinValue << " ";
- stream << (ui32)Coder.Entries[i].Prefix << " ";
- stream << (ui32)Coder.Entries[i].PrefixBits << " ";
- stream << (ui32)Coder.Entries[i].AllBits << " ";
- stream << Endl;
- }
- stream << offset << "end" << Endl;
- }
- void BeginElement(TBitBuffer& out) {
- Position = out.Position;
- }
- void Add(ui32 value, TBitBuffer& out) {
- size_t val = 0;
- ui64 code = Coder.Code(value, val);
- out.Code(code, val);
- }
- void AddDelayed(ui32 value, TBitBuffer& out) {
- size_t val = 0;
- ui64 code = Coder.Code(value, val);
- if (Position == (ui64)(-1)) {
- ythrow yexception() << "Position == (ui64)(-1)";
- }
- out.Insert(code, Position, val);
- out.Junk();
- }
- };
- struct THist {
- TAccum Accum;
- THist() {
- }
- void Load(IInputStream& stream) {
- TString name;
- while (1) {
- stream >> name;
- if (name == "end")
- return;
- }
- }
- // TODO: why not const TString& ???
- void Save(IOutputStream& /*stream*/, TString /*offset*/) {
- }
- void Add(ui32 value, TEmpty& /*empty*/) {
- Accum.Add(value);
- }
- void AddDelayed(ui32 value, TEmpty& /*empty*/) {
- Accum.Add(value);
- }
- void BeginElement(TEmpty& /*empty*/) {
- }
- };
- struct THistToHuff {
- static THistToHuff Instance() {
- return THistToHuff();
- }
- TEmpty Build() const {
- return TEmpty();
- }
- void Build(THuff& info, const THist& hist) const {
- Analyze(hist.Accum, info.Coder.Entries);
- info.Coder.Normalize();
- }
- };
- struct IDecompressor {
- // sequentially decompresses whole structure according to metainfo, starts at position offset
- virtual void Decompress(const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset) = 0;
- // decompresses one record of outer repeated structure starting at position offset
- virtual void DecompressOne(const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset, ui32 prevIndex = -1) = 0;
- virtual ~IDecompressor() = default;
- };
- template <class X>
- struct TMetaIterator: public IDecompressor {
- X Self;
- TMetaIterator() {
- Self.Parent = this;
- }
- private:
- inline void DecompressSingle(ui32 repeatedIndex, const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset) {
- Self.BeginElement(repeatedIndex);
- ui32 mask = table->Mask.Decompress(codes, offset);
- size_t index = 0;
- ui32 scalarMask = table->ScalarMask;
- while (mask || scalarMask) {
- if (mask & 1) {
- if (scalarMask & 1) {
- ui32 val = table->Scalar[index].Decompress(codes, offset);
- Self.SetScalar(index, val);
- } else {
- Self.GetDecompressor(index).Decompress(table->Repeated[index].Get(), codes, offset);
- }
- } else if ((scalarMask & 1) && table->Default[index].Type == TScalarDefaultValue::Fixed) {
- Self.SetScalar(index, table->Default[index].Value);
- }
- scalarMask = scalarMask >> 1;
- mask = mask >> 1;
- ++index;
- }
- Self.EndElement();
- }
- inline void DecompressSingleScalarsOnly(ui32 repeatedIndex, const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset) {
- Self.BeginElement(repeatedIndex);
- ui32 mask = table->Mask.Decompress(codes, offset);
- ui32 scalarMask = table->ScalarMask;
- size_t index = 0;
- while (scalarMask) {
- if (mask & 1) {
- ui32 val = table->Scalar[index].Decompress(codes, offset);
- Self.SetScalar(index, val);
- } else if (table->Default[index].Type == TScalarDefaultValue::Fixed) {
- Self.SetScalar(index, table->Default[index].Value);
- }
- mask = mask >> 1;
- scalarMask = scalarMask >> 1;
- ++index;
- }
- Self.EndElement();
- }
- public:
- void Decompress(const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset) override {
- ui64 locOffset = offset;
- ui32 count = table->Count.Decompress(codes, locOffset);
- ui32 repeatedIndex = (ui32)(-1);
- Self.BeginSelf(count, table->Id);
- if (table->RepeatedMask) {
- for (ui32 i = 0; i < count; ++i) {
- repeatedIndex += table->Index.Decompress(codes, locOffset);
- DecompressSingle(repeatedIndex, table, codes, locOffset);
- }
- } else {
- for (ui32 i = 0; i < count; ++i) {
- repeatedIndex += table->Index.Decompress(codes, locOffset);
- DecompressSingleScalarsOnly(repeatedIndex, table, codes, locOffset);
- }
- }
- offset = locOffset;
- Self.EndSelf();
- }
- // XXX: iterator needed?
- //
- // Following two functions serves the purpose of decompressing outer repeated-structure(such structure is mandatory now).
- // They can work for inner repeated structures too, if you supply correct table, codes and offset parameters.
- void DecompressOne(const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset, ui32 prevIndex = -1) override {
- table->Index.Decompress(codes, offset);
- DecompressSingle(prevIndex, table, codes, offset);
- }
- ui32 DecompressCount(const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset) {
- return table->Count.Decompress(codes, offset);
- }
- };
- template <class X>
- struct TParentHold {
- TMetaIterator<X>* Parent;
- TParentHold()
- : Parent(nullptr)
- {
- }
- };
- struct TEmptyDecompressor: public TParentHold<TEmptyDecompressor> {
- void BeginSelf(ui32 /*count*/, ui32 /*id*/) {
- }
- void EndSelf() {
- }
- void BeginElement(ui32 /*element*/) {
- }
- void EndElement() {
- }
- void SetScalar(size_t /*index*/, ui32 /*val*/) {
- }
- TMetaIterator<TEmptyDecompressor>& GetDecompressor(size_t index) {
- Y_UNUSED(index);
- return *Parent;
- }
- };
- inline TMetaIterator<TEmptyDecompressor>& GetEmptyDecompressor() {
- static TMetaIterator<TEmptyDecompressor> empty;
- return empty;
- }
- struct THuffToTable {
- static THuffToTable Instance() {
- return THuffToTable();
- }
- void Build(TTable& table, const THuff& huff) const {
- // can happen if a field was never serialized across whole structure
- if (huff.Coder.Entries.empty())
- return;
- for (ui32 i = 0; i < 64; ++i) {
- const TCoderEntry& entry = huff.Coder.GetEntry(i, table.Id[i]);
- table.CodeBase[i] = entry.MinValue;
- table.PrefLength[i] = entry.PrefixBits;
- table.CodeMask[i] = (1ULL << (entry.AllBits - entry.PrefixBits)) - 1ULL;
- table.Length[i] = entry.AllBits;
- }
- }
- };
- struct THuffToTableWithDecompressor: private THuffToTable {
- TSimpleSharedPtr<IDecompressor> Decompressor;
- THuffToTableWithDecompressor(TSimpleSharedPtr<IDecompressor> decompressor)
- : Decompressor(decompressor)
- {
- }
- TSimpleSharedPtr<IDecompressor> Build() const {
- return Decompressor;
- }
- void Build(TTable& table, const THuff& huff) const {
- THuffToTable::Build(table, huff);
- }
- };
- }
|