codecs.cpp 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190
  1. #include "codecs.h"
  2. #include "tls_cache.h"
  3. #include <util/stream/mem.h>
  4. namespace NCodecs {
  5. void ICodec::Store(IOutputStream* out, TCodecPtr p) {
  6. if (!p.Get()) {
  7. ::Save(out, (ui16)0);
  8. return;
  9. }
  10. Y_ENSURE_EX(p->AlreadyTrained(), TCodecException() << "untrained codec " << p->GetName());
  11. const TString& n = p->GetName();
  12. Y_ABORT_UNLESS(n.size() <= Max<ui16>());
  13. ::Save(out, (ui16)n.size());
  14. out->Write(n.data(), n.size());
  15. p->Save(out);
  16. }
  17. TCodecPtr ICodec::Restore(IInputStream* in) {
  18. ui16 l = 0;
  19. ::Load(in, l);
  20. if (!l) {
  21. return nullptr;
  22. }
  23. TString n;
  24. n.resize(l);
  25. Y_ENSURE_EX(in->Load(n.begin(), l) == l, TCodecException());
  26. TCodecPtr p = ICodec::GetInstance(n);
  27. p->Load(in);
  28. p->Trained = true;
  29. return p;
  30. }
  31. TCodecPtr ICodec::RestoreFromString(TStringBuf s) {
  32. TMemoryInput minp{s.data(), s.size()};
  33. return Restore(&minp);
  34. }
  35. TString ICodec::GetNameSafe(TCodecPtr p) {
  36. return !p ? TString("none") : p->GetName();
  37. }
  38. ui8 TPipelineCodec::Encode(TStringBuf in, TBuffer& out) const {
  39. size_t res = Traits().ApproximateSizeOnEncode(in.size());
  40. out.Reserve(res);
  41. out.Clear();
  42. if (Pipeline.empty()) {
  43. out.Append(in.data(), in.size());
  44. return 0;
  45. } else if (Pipeline.size() == 1) {
  46. return Pipeline.front()->Encode(in, out);
  47. }
  48. ui8 freelastbits = 0;
  49. auto buffer = TBufferTlsCache::TlsInstance().Item();
  50. TBuffer& tmp = buffer.Get();
  51. tmp.Reserve(res);
  52. for (auto it = Pipeline.begin(); it != Pipeline.end(); ++it) {
  53. if (it != Pipeline.begin()) {
  54. tmp.Clear();
  55. tmp.Swap(out);
  56. in = TStringBuf{tmp.data(), tmp.size()};
  57. }
  58. freelastbits = (*it)->Encode(in, out);
  59. }
  60. return freelastbits;
  61. }
  62. void TPipelineCodec::Decode(TStringBuf in, TBuffer& out) const {
  63. size_t res = Traits().ApproximateSizeOnDecode(in.size());
  64. out.Reserve(res);
  65. out.Clear();
  66. if (Pipeline.empty()) {
  67. out.Append(in.data(), in.size());
  68. return;
  69. } else if (Pipeline.size() == 1) {
  70. Pipeline.front()->Decode(in, out);
  71. return;
  72. }
  73. auto buffer = TBufferTlsCache::TlsInstance().Item();
  74. TBuffer& tmp = buffer.Get();
  75. tmp.Reserve(res);
  76. for (TPipeline::const_reverse_iterator it = Pipeline.rbegin(); it != Pipeline.rend(); ++it) {
  77. if (it != Pipeline.rbegin()) {
  78. tmp.Clear();
  79. tmp.Swap(out);
  80. in = TStringBuf{tmp.data(), tmp.size()};
  81. }
  82. (*it)->Decode(in, out);
  83. }
  84. }
  85. void TPipelineCodec::Save(IOutputStream* out) const {
  86. for (const auto& it : Pipeline)
  87. it->Save(out);
  88. }
  89. void TPipelineCodec::Load(IInputStream* in) {
  90. for (const auto& it : Pipeline) {
  91. it->Load(in);
  92. it->SetTrained(true);
  93. }
  94. }
  95. void TPipelineCodec::SetTrained(bool t) {
  96. for (const auto& it : Pipeline) {
  97. it->SetTrained(t);
  98. }
  99. }
  100. TPipelineCodec& TPipelineCodec::AddCodec(TCodecPtr codec) {
  101. if (!codec)
  102. return *this;
  103. TCodecTraits tr = codec->Traits();
  104. if (!MyName) {
  105. MyTraits.AssumesStructuredInput = tr.AssumesStructuredInput;
  106. MyTraits.SizeOfInputElement = tr.SizeOfInputElement;
  107. } else {
  108. MyName.append(':');
  109. }
  110. MyName.append(codec->GetName());
  111. MyTraits.PreservesPrefixGrouping &= tr.PreservesPrefixGrouping;
  112. MyTraits.PaddingBit = tr.PaddingBit;
  113. MyTraits.NeedsTraining |= tr.NeedsTraining;
  114. MyTraits.Irreversible |= tr.Irreversible;
  115. MyTraits.SizeOnEncodeAddition = MyTraits.SizeOnEncodeAddition * tr.SizeOnEncodeMultiplier + tr.SizeOnEncodeAddition;
  116. MyTraits.SizeOnEncodeMultiplier *= tr.SizeOnEncodeMultiplier;
  117. MyTraits.SizeOnDecodeMultiplier *= tr.SizeOnDecodeMultiplier;
  118. MyTraits.RecommendedSampleSize = Max(MyTraits.RecommendedSampleSize, tr.RecommendedSampleSize);
  119. Pipeline.push_back(codec);
  120. return *this;
  121. }
  122. void TPipelineCodec::DoLearnX(ISequenceReader& in, double sampleSizeMult) {
  123. if (!Traits().NeedsTraining) {
  124. return;
  125. }
  126. if (Pipeline.size() == 1) {
  127. Pipeline.back()->Learn(in);
  128. return;
  129. }
  130. TVector<TBuffer> trainingInput;
  131. TStringBuf r;
  132. while (in.NextRegion(r)) {
  133. trainingInput.emplace_back(r.data(), r.size());
  134. }
  135. TBuffer buff;
  136. for (const auto& it : Pipeline) {
  137. it->LearnX(trainingInput.begin(), trainingInput.end(), sampleSizeMult);
  138. for (auto& bit : trainingInput) {
  139. buff.Clear();
  140. it->Encode(TStringBuf{bit.data(), bit.size()}, buff);
  141. buff.Swap(bit);
  142. }
  143. }
  144. }
  145. bool TPipelineCodec::AlreadyTrained() const {
  146. for (const auto& it : Pipeline) {
  147. if (!it->AlreadyTrained())
  148. return false;
  149. }
  150. return true;
  151. }
  152. }