123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244 |
- #pragma once
- #include "codecs.h"
- #include <library/cpp/containers/comptrie/comptrie_trie.h>
- #include <library/cpp/codecs/greedy_dict/gd_builder.h>
- #include <util/string/cast.h>
- #include <util/string/escape.h>
- namespace NCodecs {
// TODO: Try adding the suffixes of the mined words to the dictionary along with the words themselves.
// TODO: This might buy robustness to small changes in the corpus without losing too much compression.
- struct TVarIntTraits {
- static const size_t MAX_VARINT32_BYTES = 5;
- static void Write(ui32 value, TBuffer& b) {
- while (value > 0x7F) {
- b.Append(static_cast<ui8>(value) | 0x80);
- value >>= 7;
- }
- b.Append(static_cast<ui8>(value) & 0x7F);
- }
- static void Read(TStringBuf& r, ui32& value) {
- ui32 result = 0;
- for (ui32 count = 0; count < MAX_VARINT32_BYTES; ++count) {
- const ui32 b = static_cast<ui8>(r[0]);
- r.Skip(1);
- result |= static_cast<ui32>(b & 0x7F) << (7 * count);
- if (!(b & 0x80)) {
- value = result;
- return;
- } else if (Y_UNLIKELY(r.empty())) {
- break;
- }
- }
- Y_ENSURE_EX(false, TCodecException() << "Bad data");
- }
- };
- struct TShortIntTraits {
- static const size_t SHORTINT_SIZE_LIMIT = 0x8000;
- Y_FORCE_INLINE static void Write(ui32 value, TBuffer& b) {
- Y_ENSURE_EX(value < SHORTINT_SIZE_LIMIT, TCodecException() << "Bad write method");
- if (value >= 0x80) {
- b.Append(static_cast<ui8>(value >> 8) | 0x80);
- }
- b.Append(static_cast<ui8>(value));
- }
- Y_FORCE_INLINE static void Read(TStringBuf& r, ui32& value) {
- ui32 result = static_cast<ui8>(r[0]);
- r.Skip(1);
- if (result >= 0x80) {
- Y_ENSURE_EX(!r.empty(), TCodecException() << "Bad data");
- result = ((result << 8) & 0x7FFF) | static_cast<ui8>(r[0]);
- r.Skip(1);
- }
- value = result;
- }
- };
- class TSolarCodec: public ICodec {
- public:
- static TStringBuf MyName8k() {
- return TStringBuf("solar-8k");
- }
- static TStringBuf MyName16k() {
- return TStringBuf("solar-16k");
- }
- static TStringBuf MyName32k() {
- return TStringBuf("solar-32k");
- }
- static TStringBuf MyName64k() {
- return TStringBuf("solar-64k");
- }
- static TStringBuf MyName256k() {
- return TStringBuf("solar-256k");
- }
- static TStringBuf MyName() {
- return TStringBuf("solar");
- }
- static TStringBuf MyName8kAdapt() {
- return TStringBuf("solar-8k-a");
- }
- static TStringBuf MyName16kAdapt() {
- return TStringBuf("solar-16k-a");
- }
- static TStringBuf MyName32kAdapt() {
- return TStringBuf("solar-32k-a");
- }
- static TStringBuf MyName64kAdapt() {
- return TStringBuf("solar-64k-a");
- }
- static TStringBuf MyName256kAdapt() {
- return TStringBuf("solar-256k-a");
- }
- static TStringBuf MyNameShortInt() {
- return TStringBuf("solar-si");
- }
- explicit TSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings())
- : Settings(s)
- , MaxEntries(maxentries)
- , MaxIterations(maxiter)
- {
- MyTraits.NeedsTraining = true;
- MyTraits.SizeOnDecodeMultiplier = 2;
- MyTraits.RecommendedSampleSize = maxentries * s.GrowLimit * maxiter * 8;
- }
- ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override {
- EncodeImpl<TVarIntTraits>(r, b);
- return 0;
- }
- void Decode(TStringBuf r, TBuffer& b) const override {
- DecodeImpl<TVarIntTraits>(r, b);
- }
- TString GetName() const override {
- return ToString(MyName());
- }
- protected:
- void DoLearn(ISequenceReader&) override;
- void Save(IOutputStream*) const override;
- void Load(IInputStream*) override;
- Y_FORCE_INLINE TStringBuf SubStr(ui32 begoff, ui32 endoff) const {
- return TStringBuf(Pool.Data() + begoff, endoff - begoff);
- }
- Y_FORCE_INLINE TStringBuf DoDecode(ui32 num) const {
- return SubStr(Decoder[num - 1], Decoder[num]);
- }
- template <class TTraits>
- Y_FORCE_INLINE void EncodeImpl(TStringBuf r, TBuffer& b) const {
- b.Clear();
- b.Reserve(r.size());
- while (!r.empty()) {
- size_t sz = 0;
- ui32 val = (ui32)-1;
- Encoder.FindLongestPrefix(r, &sz, &val);
- TTraits::Write(val + 1, b);
- r.Skip(Max<size_t>(sz, 1));
- }
- }
- template <class TTraits>
- Y_FORCE_INLINE void DecodeImpl(TStringBuf r, TBuffer& b) const {
- b.Clear();
- b.Reserve(r.size());
- ui32 v = 0;
- while (!r.empty()) {
- TTraits::Read(r, v);
- TStringBuf s = DoDecode(v);
- b.Append(s.data(), s.size());
- }
- }
- inline bool CanUseShortInt() const {
- return Decoder.size() < TShortIntTraits::SHORTINT_SIZE_LIMIT;
- }
- private:
- typedef TCompactTrie<char, ui32> TEncoder;
- typedef TVector<ui32> TDecoder;
- TBuffer Pool;
- TEncoder Encoder;
- TDecoder Decoder;
- NGreedyDict::TBuildSettings Settings;
- ui32 MaxEntries;
- ui32 MaxIterations;
- };
- // Uses varints or shortints depending on the decoder size
- class TAdaptiveSolarCodec: public TSolarCodec {
- public:
- explicit TAdaptiveSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings())
- : TSolarCodec(maxentries, maxiter, s)
- {
- }
- ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override {
- if (CanUseShortInt()) {
- EncodeImpl<TShortIntTraits>(r, b);
- } else {
- EncodeImpl<TVarIntTraits>(r, b);
- }
- return 0;
- }
- void Decode(TStringBuf r, TBuffer& b) const override {
- if (CanUseShortInt()) {
- DecodeImpl<TShortIntTraits>(r, b);
- } else {
- DecodeImpl<TVarIntTraits>(r, b);
- }
- }
- TString GetName() const override {
- if (CanUseShortInt()) {
- return ToString(MyNameShortInt());
- } else {
- return ToString(MyName());
- }
- }
- };
- class TSolarCodecShortInt: public TSolarCodec {
- public:
- explicit TSolarCodecShortInt(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings())
- : TSolarCodec(maxentries, maxiter, s)
- {
- }
- ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override {
- EncodeImpl<TShortIntTraits>(r, b);
- return 0;
- }
- void Decode(TStringBuf r, TBuffer& b) const override {
- DecodeImpl<TShortIntTraits>(r, b);
- }
- TString GetName() const override {
- return ToString(MyNameShortInt());
- }
- protected:
- void Load(IInputStream* in) override {
- TSolarCodec::Load(in);
- Y_ENSURE_EX(CanUseShortInt(), TCodecException() << "Bad data");
- }
- };
- }
|