#pragma once #include #include #include #include #include "huff.h" #include "compressor.h" #include "metainfo.h" namespace NCompProto { struct TBitBuffer { TVector Out; ui64 Position; ui64 Size; ui8 Counter; TBitBuffer() { Clear(); } static ui64 Read(const ui8* out, ui64 position, size_t size) { const ui8* dst = out + (size_t(position >> 3)); ui64 outCode = *reinterpret_cast(dst); ui8 shift = position & 7; return ((1ULL << size) - 1) & (outCode >> shift); } ui64 Read(ui64 position, size_t size) { return Read(&Out[0], position, size); } void Code(ui64 value, size_t size) { if (++Counter == 0) { Junk(257 * 64); } Position = Code(value, size, Position); } ui64 Code(ui64 value, size_t size, ui64 position) { ui8* dst = &Out[size_t(position >> 3)]; ui64& outCode = *(ui64*)dst; ui8 shift = position & 7; ui64 mask = ((1ULL << size) - 1) << shift; outCode = ((value << shift) & mask) | (outCode & ~mask); return position + size; } void Junk(size_t junk = 1024) { size_t need = size_t(Position >> 3); if (Out.size() * 8 < Position + junk) { Out.resize(((need + junk) * 3) / 2); Size = Out.size(); } } void Insert(ui64 value, ui64 position, size_t size) { ui64 newVal = Read(position, 56); position = Code(value, size, position); value = newVal; while (position < Position + 64) { newVal = Read(position + 56 - size, 56); position = Code(value, 56, position); value = newVal; } Position += size; } size_t ByteLength() const { return (Position + 7) / 8; } void ResetPosition() { Position = 0; Size = 0; } void Clear() { Counter = 0; Position = 0; Size = 0; Out.clear(); Junk(); } TArrayRef AsDataRegion() const { return TArrayRef(reinterpret_cast(Out.data()), ByteLength()); } }; struct THuff { TCoder Coder; ui64 Position; THuff() { Position = (ui64)(-1); } void Load(IInputStream& stream) { TString name; Coder.InitDefault(); Coder.Entries.clear(); while (1) { stream >> name; if (name == "end") return; else if (name == "entry") { TCoderEntry entry; ui32 value; stream >> value; entry.MinValue = value; stream >> value; entry.Prefix = value; stream >> value; entry.PrefixBits = value; stream >> value; entry.AllBits = value; Coder.Entries.push_back(entry); } } } void Save(IOutputStream& stream, TString offset) { TString step = " "; for (size_t i = 0; i < Coder.Entries.size(); ++i) { stream << offset << step << "entry "; stream << (ui32)Coder.Entries[i].MinValue << " "; stream << (ui32)Coder.Entries[i].Prefix << " "; stream << (ui32)Coder.Entries[i].PrefixBits << " "; stream << (ui32)Coder.Entries[i].AllBits << " "; stream << Endl; } stream << offset << "end" << Endl; } void BeginElement(TBitBuffer& out) { Position = out.Position; } void Add(ui32 value, TBitBuffer& out) { size_t val = 0; ui64 code = Coder.Code(value, val); out.Code(code, val); } void AddDelayed(ui32 value, TBitBuffer& out) { size_t val = 0; ui64 code = Coder.Code(value, val); if (Position == (ui64)(-1)) { ythrow yexception() << "Position == (ui64)(-1)"; } out.Insert(code, Position, val); out.Junk(); } }; struct THist { TAccum Accum; THist() { } void Load(IInputStream& stream) { TString name; while (1) { stream >> name; if (name == "end") return; } } // TODO: why not const TString& ??? void Save(IOutputStream& /*stream*/, TString /*offset*/) { } void Add(ui32 value, TEmpty& /*empty*/) { Accum.Add(value); } void AddDelayed(ui32 value, TEmpty& /*empty*/) { Accum.Add(value); } void BeginElement(TEmpty& /*empty*/) { } }; struct THistToHuff { static THistToHuff Instance() { return THistToHuff(); } TEmpty Build() const { return TEmpty(); } void Build(THuff& info, const THist& hist) const { Analyze(hist.Accum, info.Coder.Entries); info.Coder.Normalize(); } }; struct IDecompressor { // sequentially decompresses whole structure according to metainfo, starts at position offset virtual void Decompress(const TMetaInfo* table, const ui8* codes, ui64& offset) = 0; // decompresses one record of outer repeated structure starting at position offset virtual void DecompressOne(const TMetaInfo* table, const ui8* codes, ui64& offset, ui32 prevIndex = -1) = 0; virtual ~IDecompressor() = default; }; template struct TMetaIterator: public IDecompressor { X Self; TMetaIterator() { Self.Parent = this; } private: inline void DecompressSingle(ui32 repeatedIndex, const TMetaInfo* table, const ui8* codes, ui64& offset) { Self.BeginElement(repeatedIndex); ui32 mask = table->Mask.Decompress(codes, offset); size_t index = 0; ui32 scalarMask = table->ScalarMask; while (mask || scalarMask) { if (mask & 1) { if (scalarMask & 1) { ui32 val = table->Scalar[index].Decompress(codes, offset); Self.SetScalar(index, val); } else { Self.GetDecompressor(index).Decompress(table->Repeated[index].Get(), codes, offset); } } else if ((scalarMask & 1) && table->Default[index].Type == TScalarDefaultValue::Fixed) { Self.SetScalar(index, table->Default[index].Value); } scalarMask = scalarMask >> 1; mask = mask >> 1; ++index; } Self.EndElement(); } inline void DecompressSingleScalarsOnly(ui32 repeatedIndex, const TMetaInfo* table, const ui8* codes, ui64& offset) { Self.BeginElement(repeatedIndex); ui32 mask = table->Mask.Decompress(codes, offset); ui32 scalarMask = table->ScalarMask; size_t index = 0; while (scalarMask) { if (mask & 1) { ui32 val = table->Scalar[index].Decompress(codes, offset); Self.SetScalar(index, val); } else if (table->Default[index].Type == TScalarDefaultValue::Fixed) { Self.SetScalar(index, table->Default[index].Value); } mask = mask >> 1; scalarMask = scalarMask >> 1; ++index; } Self.EndElement(); } public: void Decompress(const TMetaInfo* table, const ui8* codes, ui64& offset) override { ui64 locOffset = offset; ui32 count = table->Count.Decompress(codes, locOffset); ui32 repeatedIndex = (ui32)(-1); Self.BeginSelf(count, table->Id); if (table->RepeatedMask) { for (ui32 i = 0; i < count; ++i) { repeatedIndex += table->Index.Decompress(codes, locOffset); DecompressSingle(repeatedIndex, table, codes, locOffset); } } else { for (ui32 i = 0; i < count; ++i) { repeatedIndex += table->Index.Decompress(codes, locOffset); DecompressSingleScalarsOnly(repeatedIndex, table, codes, locOffset); } } offset = locOffset; Self.EndSelf(); } // XXX: iterator needed? // // Following two functions serves the purpose of decompressing outer repeated-structure(such structure is mandatory now). // They can work for inner repeated structures too, if you supply correct table, codes and offset parameters. void DecompressOne(const TMetaInfo* table, const ui8* codes, ui64& offset, ui32 prevIndex = -1) override { table->Index.Decompress(codes, offset); DecompressSingle(prevIndex, table, codes, offset); } ui32 DecompressCount(const TMetaInfo* table, const ui8* codes, ui64& offset) { return table->Count.Decompress(codes, offset); } }; template struct TParentHold { TMetaIterator* Parent; TParentHold() : Parent(nullptr) { } }; struct TEmptyDecompressor: public TParentHold { void BeginSelf(ui32 /*count*/, ui32 /*id*/) { } void EndSelf() { } void BeginElement(ui32 /*element*/) { } void EndElement() { } void SetScalar(size_t /*index*/, ui32 /*val*/) { } TMetaIterator& GetDecompressor(size_t index) { Y_UNUSED(index); return *Parent; } }; inline TMetaIterator& GetEmptyDecompressor() { static TMetaIterator empty; return empty; } struct THuffToTable { static THuffToTable Instance() { return THuffToTable(); } void Build(TTable& table, const THuff& huff) const { // can happen if a field was never serialized across whole structure if (huff.Coder.Entries.empty()) return; for (ui32 i = 0; i < 64; ++i) { const TCoderEntry& entry = huff.Coder.GetEntry(i, table.Id[i]); table.CodeBase[i] = entry.MinValue; table.PrefLength[i] = entry.PrefixBits; table.CodeMask[i] = (1ULL << (entry.AllBits - entry.PrefixBits)) - 1ULL; table.Length[i] = entry.AllBits; } } }; struct THuffToTableWithDecompressor: private THuffToTable { TSimpleSharedPtr Decompressor; THuffToTableWithDecompressor(TSimpleSharedPtr decompressor) : Decompressor(decompressor) { } TSimpleSharedPtr Build() const { return Decompressor; } void Build(TTable& table, const THuff& huff) const { THuffToTable::Build(table, huff); } }; }