solar_codec.cpp 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136
  1. #include "solar_codec.h"
  2. #include <library/cpp/codecs/greedy_dict/gd_builder.h>
  3. #include <library/cpp/containers/comptrie/comptrie_builder.h>
  4. #include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h>
  5. #include <util/stream/length.h>
  6. #include <util/string/printf.h>
  7. #include <util/ysaveload.h>
  8. namespace NCodecs {
  9. static inline ui32 Append(TBuffer& pool, TStringBuf data) {
  10. pool.Append(data.data(), data.size());
  11. return pool.Size();
  12. }
  13. void TSolarCodec::DoLearn(ISequenceReader& r) {
  14. using namespace NGreedyDict;
  15. const ui32 maxlen = Max<ui32>() / Max<ui32>(MaxEntries, 1);
  16. Decoder.clear();
  17. Pool.Clear();
  18. THolder<TEntrySet> set;
  19. {
  20. TMemoryPool pool(8112, TMemoryPool::TLinearGrow::Instance());
  21. TStringBufs bufs;
  22. TStringBuf m;
  23. while (r.NextRegion(m)) {
  24. bufs.push_back(pool.AppendString(m));
  25. }
  26. {
  27. TDictBuilder b(Settings);
  28. b.SetInput(bufs);
  29. b.Build(MaxEntries, MaxIterations, maxlen);
  30. set = b.ReleaseEntrySet();
  31. }
  32. }
  33. set->SetScores(ES_LEN_COUNT);
  34. {
  35. TVector<std::pair<float, TStringBuf>> tmp;
  36. tmp.reserve(set->size());
  37. for (const auto& it : *set) {
  38. Y_ENSURE(it.Str.size() <= maxlen);
  39. tmp.push_back(std::make_pair(-it.Score, it.Str));
  40. }
  41. Sort(tmp.begin(), tmp.end());
  42. Decoder.reserve(tmp.size() + 1);
  43. Decoder.push_back(0);
  44. for (const auto& it : tmp) {
  45. Y_ENSURE(Decoder.back() == Pool.Size(), "learning invariant failed");
  46. ui32 endoff = Append(Pool, it.second);
  47. Decoder.push_back(endoff);
  48. }
  49. }
  50. Pool.ShrinkToFit();
  51. Decoder.shrink_to_fit();
  52. TBufferOutput bout;
  53. {
  54. TVector<std::pair<TStringBuf, ui32>> tmp2;
  55. tmp2.reserve(Decoder.size());
  56. for (ui32 i = 1, sz = Decoder.size(); i < sz; ++i) {
  57. TStringBuf s = DoDecode(i);
  58. tmp2.push_back(std::make_pair(s, i - 1));
  59. Y_ENSURE(s.size() == (Decoder[i] - Decoder[i - 1]), "learning invariant failed");
  60. }
  61. Sort(tmp2.begin(), tmp2.end());
  62. {
  63. TEncoder::TBuilder builder(CTBF_PREFIX_GROUPED);
  64. for (const auto& it : tmp2) {
  65. builder.Add(it.first.data(), it.first.size(), it.second);
  66. }
  67. builder.Save(bout);
  68. }
  69. }
  70. Encoder.Init(TBlob::FromBuffer(bout.Buffer()));
  71. }
  72. void TSolarCodec::Save(IOutputStream* out) const {
  73. TBlob b = Encoder.Data();
  74. ::Save(out, (ui32)b.Size());
  75. out->Write(b.Data(), b.Size());
  76. }
  77. void TSolarCodec::Load(IInputStream* in) {
  78. ui32 sz;
  79. ::Load(in, sz);
  80. TLengthLimitedInput lin(in, sz);
  81. Encoder.Init(TBlob::FromStream(lin));
  82. Pool.Clear();
  83. Decoder.clear();
  84. TVector<std::pair<ui32, TString>> tmp;
  85. ui32 poolsz = 0;
  86. for (TEncoder::TConstIterator it = Encoder.Begin(); it != Encoder.End(); ++it) {
  87. const TString& s = it.GetKey();
  88. tmp.push_back(std::make_pair(it.GetValue(), !s ? TString("\0", 1) : s));
  89. poolsz += Max<ui32>(s.size(), 1);
  90. }
  91. Sort(tmp.begin(), tmp.end());
  92. Pool.Reserve(poolsz);
  93. Decoder.reserve(tmp.size() + 1);
  94. Decoder.push_back(0);
  95. for (ui32 i = 0, sz2 = tmp.size(); i < sz2; ++i) {
  96. Y_ENSURE(i == tmp[i].first, "oops! " << i << " " << tmp[i].first);
  97. Decoder.push_back(Append(Pool, tmp[i].second));
  98. }
  99. Pool.ShrinkToFit();
  100. Decoder.shrink_to_fit();
  101. }
  102. }