solar_codec.h 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244
  1. #pragma once
  2. #include "codecs.h"
  3. #include <library/cpp/containers/comptrie/comptrie_trie.h>
  4. #include <library/cpp/codecs/greedy_dict/gd_builder.h>
  5. #include <util/string/cast.h>
  6. #include <util/string/escape.h>
  7. namespace NCodecs {
  8. // TODO: Попробовать добавлять в словарь вместе с намайненными словами также их суффиксы.
  9. // TODO: Возможно удастся, не слишком потеряв в сжатии, выиграть в робастности к небольшим изменениям в корпусе.
  10. struct TVarIntTraits {
  11. static const size_t MAX_VARINT32_BYTES = 5;
  12. static void Write(ui32 value, TBuffer& b) {
  13. while (value > 0x7F) {
  14. b.Append(static_cast<ui8>(value) | 0x80);
  15. value >>= 7;
  16. }
  17. b.Append(static_cast<ui8>(value) & 0x7F);
  18. }
  19. static void Read(TStringBuf& r, ui32& value) {
  20. ui32 result = 0;
  21. for (ui32 count = 0; count < MAX_VARINT32_BYTES; ++count) {
  22. const ui32 b = static_cast<ui8>(r[0]);
  23. r.Skip(1);
  24. result |= static_cast<ui32>(b & 0x7F) << (7 * count);
  25. if (!(b & 0x80)) {
  26. value = result;
  27. return;
  28. } else if (Y_UNLIKELY(r.empty())) {
  29. break;
  30. }
  31. }
  32. Y_ENSURE_EX(false, TCodecException() << "Bad data");
  33. }
  34. };
  35. struct TShortIntTraits {
  36. static const size_t SHORTINT_SIZE_LIMIT = 0x8000;
  37. Y_FORCE_INLINE static void Write(ui32 value, TBuffer& b) {
  38. Y_ENSURE_EX(value < SHORTINT_SIZE_LIMIT, TCodecException() << "Bad write method");
  39. if (value >= 0x80) {
  40. b.Append(static_cast<ui8>(value >> 8) | 0x80);
  41. }
  42. b.Append(static_cast<ui8>(value));
  43. }
  44. Y_FORCE_INLINE static void Read(TStringBuf& r, ui32& value) {
  45. ui32 result = static_cast<ui8>(r[0]);
  46. r.Skip(1);
  47. if (result >= 0x80) {
  48. Y_ENSURE_EX(!r.empty(), TCodecException() << "Bad data");
  49. result = ((result << 8) & 0x7FFF) | static_cast<ui8>(r[0]);
  50. r.Skip(1);
  51. }
  52. value = result;
  53. }
  54. };
  55. class TSolarCodec: public ICodec {
  56. public:
  57. static TStringBuf MyName8k() {
  58. return TStringBuf("solar-8k");
  59. }
  60. static TStringBuf MyName16k() {
  61. return TStringBuf("solar-16k");
  62. }
  63. static TStringBuf MyName32k() {
  64. return TStringBuf("solar-32k");
  65. }
  66. static TStringBuf MyName64k() {
  67. return TStringBuf("solar-64k");
  68. }
  69. static TStringBuf MyName256k() {
  70. return TStringBuf("solar-256k");
  71. }
  72. static TStringBuf MyName() {
  73. return TStringBuf("solar");
  74. }
  75. static TStringBuf MyName8kAdapt() {
  76. return TStringBuf("solar-8k-a");
  77. }
  78. static TStringBuf MyName16kAdapt() {
  79. return TStringBuf("solar-16k-a");
  80. }
  81. static TStringBuf MyName32kAdapt() {
  82. return TStringBuf("solar-32k-a");
  83. }
  84. static TStringBuf MyName64kAdapt() {
  85. return TStringBuf("solar-64k-a");
  86. }
  87. static TStringBuf MyName256kAdapt() {
  88. return TStringBuf("solar-256k-a");
  89. }
  90. static TStringBuf MyNameShortInt() {
  91. return TStringBuf("solar-si");
  92. }
  93. explicit TSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings())
  94. : Settings(s)
  95. , MaxEntries(maxentries)
  96. , MaxIterations(maxiter)
  97. {
  98. MyTraits.NeedsTraining = true;
  99. MyTraits.SizeOnDecodeMultiplier = 2;
  100. MyTraits.RecommendedSampleSize = maxentries * s.GrowLimit * maxiter * 8;
  101. }
  102. ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override {
  103. EncodeImpl<TVarIntTraits>(r, b);
  104. return 0;
  105. }
  106. void Decode(TStringBuf r, TBuffer& b) const override {
  107. DecodeImpl<TVarIntTraits>(r, b);
  108. }
  109. TString GetName() const override {
  110. return ToString(MyName());
  111. }
  112. protected:
  113. void DoLearn(ISequenceReader&) override;
  114. void Save(IOutputStream*) const override;
  115. void Load(IInputStream*) override;
  116. Y_FORCE_INLINE TStringBuf SubStr(ui32 begoff, ui32 endoff) const {
  117. return TStringBuf(Pool.Data() + begoff, endoff - begoff);
  118. }
  119. Y_FORCE_INLINE TStringBuf DoDecode(ui32 num) const {
  120. return SubStr(Decoder[num - 1], Decoder[num]);
  121. }
  122. template <class TTraits>
  123. Y_FORCE_INLINE void EncodeImpl(TStringBuf r, TBuffer& b) const {
  124. b.Clear();
  125. b.Reserve(r.size());
  126. while (!r.empty()) {
  127. size_t sz = 0;
  128. ui32 val = (ui32)-1;
  129. Encoder.FindLongestPrefix(r, &sz, &val);
  130. TTraits::Write(val + 1, b);
  131. r.Skip(Max<size_t>(sz, 1));
  132. }
  133. }
  134. template <class TTraits>
  135. Y_FORCE_INLINE void DecodeImpl(TStringBuf r, TBuffer& b) const {
  136. b.Clear();
  137. b.Reserve(r.size());
  138. ui32 v = 0;
  139. while (!r.empty()) {
  140. TTraits::Read(r, v);
  141. TStringBuf s = DoDecode(v);
  142. b.Append(s.data(), s.size());
  143. }
  144. }
  145. inline bool CanUseShortInt() const {
  146. return Decoder.size() < TShortIntTraits::SHORTINT_SIZE_LIMIT;
  147. }
  148. private:
  149. typedef TCompactTrie<char, ui32> TEncoder;
  150. typedef TVector<ui32> TDecoder;
  151. TBuffer Pool;
  152. TEncoder Encoder;
  153. TDecoder Decoder;
  154. NGreedyDict::TBuildSettings Settings;
  155. ui32 MaxEntries;
  156. ui32 MaxIterations;
  157. };
  158. // Uses varints or shortints depending on the decoder size
  159. class TAdaptiveSolarCodec: public TSolarCodec {
  160. public:
  161. explicit TAdaptiveSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings())
  162. : TSolarCodec(maxentries, maxiter, s)
  163. {
  164. }
  165. ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override {
  166. if (CanUseShortInt()) {
  167. EncodeImpl<TShortIntTraits>(r, b);
  168. } else {
  169. EncodeImpl<TVarIntTraits>(r, b);
  170. }
  171. return 0;
  172. }
  173. void Decode(TStringBuf r, TBuffer& b) const override {
  174. if (CanUseShortInt()) {
  175. DecodeImpl<TShortIntTraits>(r, b);
  176. } else {
  177. DecodeImpl<TVarIntTraits>(r, b);
  178. }
  179. }
  180. TString GetName() const override {
  181. if (CanUseShortInt()) {
  182. return ToString(MyNameShortInt());
  183. } else {
  184. return ToString(MyName());
  185. }
  186. }
  187. };
  188. class TSolarCodecShortInt: public TSolarCodec {
  189. public:
  190. explicit TSolarCodecShortInt(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings())
  191. : TSolarCodec(maxentries, maxiter, s)
  192. {
  193. }
  194. ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override {
  195. EncodeImpl<TShortIntTraits>(r, b);
  196. return 0;
  197. }
  198. void Decode(TStringBuf r, TBuffer& b) const override {
  199. DecodeImpl<TShortIntTraits>(r, b);
  200. }
  201. TString GetName() const override {
  202. return ToString(MyNameShortInt());
  203. }
  204. protected:
  205. void Load(IInputStream* in) override {
  206. TSolarCodec::Load(in);
  207. Y_ENSURE_EX(CanUseShortInt(), TCodecException() << "Bad data");
  208. }
  209. };
  210. }