codecs.h 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259
  1. #pragma once
  2. #include "sample.h"
  3. #include <util/generic/bt_exception.h>
  4. #include <util/generic/hash.h>
  5. #include <util/generic/ptr.h>
  6. #include <util/generic/singleton.h>
  7. #include <util/stream/input.h>
  8. #include <util/stream/output.h>
  9. #include <util/string/cast.h>
  10. #include <util/string/vector.h>
  11. #include <util/system/tls.h>
  12. #include <util/ysaveload.h>
  13. namespace NCodecs {
  14. class TCodecException: public TWithBackTrace<yexception> {};
  15. class ICodec;
  16. using TCodecPtr = TIntrusivePtr<ICodec>;
  17. using TCodecConstPtr = TIntrusiveConstPtr<ICodec>;
  18. struct TCodecTraits {
  19. ui32 RecommendedSampleSize = 0;
  20. ui16 SizeOfInputElement = 1;
  21. ui8 SizeOnEncodeMultiplier = 1;
  22. ui8 SizeOnEncodeAddition = 0;
  23. ui8 SizeOnDecodeMultiplier = 1;
  24. bool NeedsTraining = false;
  25. bool PreservesPrefixGrouping = false;
  26. bool Irreversible = false;
  27. bool PaddingBit = 0;
  28. bool AssumesStructuredInput = false;
  29. size_t ApproximateSizeOnEncode(size_t sz) const {
  30. return sz * SizeOnEncodeMultiplier + SizeOnEncodeAddition;
  31. }
  32. size_t ApproximateSizeOnDecode(size_t sz) const {
  33. return sz * SizeOnDecodeMultiplier;
  34. }
  35. };
  36. class ICodec: public TAtomicRefCount<ICodec> {
  37. protected:
  38. bool Trained = false;
  39. TCodecTraits MyTraits;
  40. public:
  41. TCodecTraits Traits() const {
  42. return MyTraits;
  43. }
  44. // the name of the codec (or its variant) to be used in the codec registry
  45. virtual TString GetName() const = 0;
  46. virtual ui8 /*free bits in last byte*/ Encode(TStringBuf, TBuffer&) const = 0;
  47. virtual ui8 Encode(const TBuffer& input, TBuffer& output) const {
  48. return Encode(TStringBuf(input.Data(), input.Data() + input.Size()), output);
  49. }
  50. virtual void Decode(TStringBuf, TBuffer&) const = 0;
  51. virtual void Decode(const TBuffer& input, TBuffer& output) const {
  52. Decode(TStringBuf(input.Data(), input.Data() + input.Size()), output);
  53. }
  54. virtual ~ICodec() = default;
  55. virtual bool AlreadyTrained() const {
  56. return !Traits().NeedsTraining || Trained;
  57. }
  58. virtual void SetTrained(bool t) {
  59. Trained = t;
  60. }
  61. bool TryToLearn(ISequenceReader& r) {
  62. Trained = DoTryToLearn(r);
  63. return Trained;
  64. }
  65. void Learn(ISequenceReader& r) {
  66. LearnX(r, 1);
  67. }
  68. template <class TIter>
  69. void Learn(TIter beg, TIter end) {
  70. Learn(beg, end, IterToStringBuf<TIter>);
  71. }
  72. template <class TIter, class TGetter>
  73. void Learn(TIter beg, TIter end, TGetter getter) {
  74. auto sample = GetSample(beg, end, Traits().RecommendedSampleSize, getter);
  75. TSimpleSequenceReader<TBuffer> reader{sample};
  76. Learn(reader);
  77. }
  78. static TCodecPtr GetInstance(TStringBuf name);
  79. static TVector<TString> GetCodecsList();
  80. static TString GetNameSafe(TCodecPtr p);
  81. static void Store(IOutputStream* out, TCodecPtr p);
  82. static TCodecPtr Restore(IInputStream* in);
  83. static TCodecPtr RestoreFromString(TStringBuf);
  84. protected:
  85. virtual void DoLearn(ISequenceReader&) = 0;
  86. virtual bool DoTryToLearn(ISequenceReader& r) {
  87. DoLearn(r);
  88. return true;
  89. }
  90. // so the pipeline codec will know to adjust the sample for the subcodecs
  91. virtual void DoLearnX(ISequenceReader& r, double /*sampleSizeMultiplier*/) {
  92. DoLearn(r);
  93. }
  94. virtual void Save(IOutputStream*) const {
  95. }
  96. virtual void Load(IInputStream*) {
  97. }
  98. friend class TPipelineCodec;
  99. public:
  100. // so the pipeline codec will know to adjust the sample for the subcodecs
  101. void LearnX(ISequenceReader& r, double sampleSizeMult) {
  102. DoLearnX(r, sampleSizeMult);
  103. Trained = true;
  104. }
  105. template <class TIter>
  106. void LearnX(TIter beg, TIter end, double sampleSizeMult) {
  107. auto sample = GetSample(beg, end, Traits().RecommendedSampleSize * sampleSizeMult);
  108. TSimpleSequenceReader<TBuffer> reader{sample};
  109. LearnX(reader, sampleSizeMult);
  110. }
  111. };
  112. class TBasicTrivialCodec: public ICodec {
  113. public:
  114. ui8 Encode(TStringBuf in, TBuffer& out) const override {
  115. out.Assign(in.data(), in.size());
  116. return 0;
  117. }
  118. void Decode(TStringBuf in, TBuffer& out) const override {
  119. Encode(in, out);
  120. }
  121. protected:
  122. void DoLearn(ISequenceReader&) override {
  123. }
  124. };
  125. class TTrivialCodec: public TBasicTrivialCodec {
  126. public:
  127. TTrivialCodec() {
  128. MyTraits.PreservesPrefixGrouping = true;
  129. }
  130. static TStringBuf MyName() {
  131. return "trivial";
  132. }
  133. TString GetName() const override {
  134. return ToString(MyName());
  135. }
  136. };
  137. class TTrivialTrainableCodec: public TBasicTrivialCodec {
  138. public:
  139. TTrivialTrainableCodec() {
  140. MyTraits.PreservesPrefixGrouping = true;
  141. MyTraits.NeedsTraining = true;
  142. }
  143. static TStringBuf MyName() {
  144. return "trivial-trainable";
  145. }
  146. TString GetName() const override {
  147. return ToString(MyName());
  148. }
  149. };
  150. class TNullCodec: public ICodec {
  151. public:
  152. TNullCodec() {
  153. MyTraits.Irreversible = true;
  154. MyTraits.SizeOnDecodeMultiplier = 0;
  155. MyTraits.SizeOnEncodeMultiplier = 0;
  156. }
  157. TString GetName() const override {
  158. return "null";
  159. }
  160. ui8 Encode(TStringBuf, TBuffer& out) const override {
  161. out.Clear();
  162. return 0;
  163. }
  164. void Decode(TStringBuf, TBuffer& out) const override {
  165. out.Clear();
  166. }
  167. protected:
  168. void DoLearn(ISequenceReader&) override {
  169. }
  170. };
  171. class TPipelineCodec: public ICodec {
  172. typedef TVector<TCodecPtr> TPipeline;
  173. TPipeline Pipeline;
  174. TString MyName;
  175. public:
  176. explicit TPipelineCodec(TCodecPtr c0 = nullptr, TCodecPtr c1 = nullptr, TCodecPtr c2 = nullptr, TCodecPtr c3 = nullptr) {
  177. MyTraits.PreservesPrefixGrouping = true;
  178. AddCodec(c0);
  179. AddCodec(c1);
  180. AddCodec(c2);
  181. AddCodec(c3);
  182. }
  183. TString GetName() const override {
  184. return MyName;
  185. }
  186. ui8 Encode(TStringBuf in, TBuffer& out) const override;
  187. void Decode(TStringBuf in, TBuffer& out) const override;
  188. public:
  189. /*
  190. * Add codecs in the following order:
  191. * uncompressed -> codec0 | codec1 | ... | codecN -> compressed
  192. */
  193. TPipelineCodec& AddCodec(TCodecPtr codec);
  194. bool AlreadyTrained() const override;
  195. void SetTrained(bool t) override;
  196. protected:
  197. void DoLearn(ISequenceReader& in) override {
  198. DoLearnX(in, 1);
  199. }
  200. void DoLearnX(ISequenceReader& in, double sampleSizeMult) override;
  201. void Save(IOutputStream* out) const override;
  202. void Load(IInputStream* in) override;
  203. };
  204. }