solar_codec.cpp 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. #include "solar_codec.h"
  2. #include <library/cpp/codecs/greedy_dict/gd_builder.h>
  3. #include <library/cpp/containers/comptrie/comptrie_builder.h>
  4. #include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h>
  5. #include <util/stream/length.h>
  6. #include <util/string/printf.h>
  7. #include <util/ysaveload.h>
  8. namespace NCodecs {
  9. static inline ui32 Append(TBuffer& pool, TStringBuf data) {
  10. pool.Append(data.data(), data.size());
  11. return pool.Size();
  12. }
  13. void TSolarCodec::DoLearn(ISequenceReader& r) {
  14. using namespace NGreedyDict;
  15. Decoder.clear();
  16. Pool.Clear();
  17. THolder<TEntrySet> set;
  18. {
  19. TMemoryPool pool(8112, TMemoryPool::TLinearGrow::Instance());
  20. TStringBufs bufs;
  21. TStringBuf m;
  22. while (r.NextRegion(m)) {
  23. bufs.push_back(pool.AppendString(m));
  24. }
  25. {
  26. TDictBuilder b(Settings);
  27. b.SetInput(bufs);
  28. b.Build(MaxEntries, MaxIterations);
  29. set = b.ReleaseEntrySet();
  30. }
  31. }
  32. set->SetScores(ES_LEN_COUNT);
  33. {
  34. TVector<std::pair<float, TStringBuf>> tmp;
  35. tmp.reserve(set->size());
  36. for (const auto& it : *set) {
  37. tmp.push_back(std::make_pair(-it.Score, TStringBuf(it.Str).Trunc(Max<ui32>() / Max<ui32>(MaxEntries, 1))));
  38. }
  39. Sort(tmp.begin(), tmp.end());
  40. Decoder.reserve(tmp.size() + 1);
  41. Decoder.push_back(0);
  42. for (const auto& it : tmp) {
  43. Y_ENSURE(Decoder.back() == Pool.Size(), "learning invariant failed");
  44. ui32 endoff = Append(Pool, it.second);
  45. Decoder.push_back(endoff);
  46. }
  47. }
  48. Pool.ShrinkToFit();
  49. Decoder.shrink_to_fit();
  50. TBufferOutput bout;
  51. {
  52. TVector<std::pair<TStringBuf, ui32>> tmp2;
  53. tmp2.reserve(Decoder.size());
  54. for (ui32 i = 1, sz = Decoder.size(); i < sz; ++i) {
  55. TStringBuf s = DoDecode(i);
  56. tmp2.push_back(std::make_pair(s, i - 1));
  57. Y_ENSURE(s.size() == (Decoder[i] - Decoder[i - 1]), "learning invariant failed");
  58. }
  59. Sort(tmp2.begin(), tmp2.end());
  60. {
  61. TEncoder::TBuilder builder(CTBF_PREFIX_GROUPED);
  62. for (const auto& it : tmp2) {
  63. builder.Add(it.first.data(), it.first.size(), it.second);
  64. }
  65. builder.Save(bout);
  66. }
  67. }
  68. Encoder.Init(TBlob::FromBuffer(bout.Buffer()));
  69. }
  70. void TSolarCodec::Save(IOutputStream* out) const {
  71. TBlob b = Encoder.Data();
  72. ::Save(out, (ui32)b.Size());
  73. out->Write(b.Data(), b.Size());
  74. }
  75. void TSolarCodec::Load(IInputStream* in) {
  76. ui32 sz;
  77. ::Load(in, sz);
  78. TLengthLimitedInput lin(in, sz);
  79. Encoder.Init(TBlob::FromStream(lin));
  80. Pool.Clear();
  81. Decoder.clear();
  82. TVector<std::pair<ui32, TString>> tmp;
  83. ui32 poolsz = 0;
  84. for (TEncoder::TConstIterator it = Encoder.Begin(); it != Encoder.End(); ++it) {
  85. const TString& s = it.GetKey();
  86. tmp.push_back(std::make_pair(it.GetValue(), !s ? TString("\0", 1) : s));
  87. poolsz += Max<ui32>(s.size(), 1);
  88. }
  89. Sort(tmp.begin(), tmp.end());
  90. Pool.Reserve(poolsz);
  91. Decoder.reserve(tmp.size() + 1);
  92. Decoder.push_back(0);
  93. for (ui32 i = 0, sz2 = tmp.size(); i < sz2; ++i) {
  94. Y_ENSURE(i == tmp[i].first, "oops! " << i << " " << tmp[i].first);
  95. Decoder.push_back(Append(Pool, tmp[i].second));
  96. }
  97. Pool.ShrinkToFit();
  98. Decoder.shrink_to_fit();
  99. }
  100. }