gd_entry.cpp 2.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. #include "gd_entry.h"
  2. #include "gd_stats.h"
  3. #include <util/generic/algorithm.h>
  4. #include <util/generic/singleton.h>
  5. namespace NGreedyDict {
  6. class TAlphas {
  7. char Memory[512];
  8. public:
  9. TStringBufs Alphas;
  10. TAlphas() {
  11. for (ui32 i = 0; i < 256; ++i) {
  12. Memory[2 * i] = (char)i;
  13. Memory[2 * i + 1] = 0;
  14. Alphas.push_back(TStringBuf(&Memory[2 * i], 1));
  15. }
  16. }
  17. };
  18. void TEntrySet::InitWithAlpha() {
  19. Pool.ClearKeepFirstChunk();
  20. const TStringBufs& a = Singleton<TAlphas>()->Alphas;
  21. for (auto it : a) {
  22. Add(it);
  23. }
  24. BuildHierarchy();
  25. }
  26. void TEntrySet::BuildHierarchy() {
  27. Sort(begin(), end(), TEntry::StrLess);
  28. TCompactTrieBuilder<char, ui32, TAsIsPacker<ui32>> builder(CTBF_PREFIX_GROUPED);
  29. for (iterator it = begin(); it != end(); ++it) {
  30. it->Number = (it - begin());
  31. TStringBuf suff = it->Str;
  32. size_t len = 0;
  33. ui32 val = 0;
  34. if (builder.FindLongestPrefix(suff.data(), suff.size(), &len, &val) && len) {
  35. it->NearestPrefix = val;
  36. }
  37. builder.Add(suff.data(), suff.size(), it->Number);
  38. }
  39. TBufferOutput bout;
  40. builder.Save(bout);
  41. Trie.Init(TBlob::FromBuffer(bout.Buffer()));
  42. }
  43. TEntry* TEntrySet::FindPrefix(TStringBuf& str) {
  44. size_t len = 0;
  45. ui32 off = 0;
  46. if (!Trie.FindLongestPrefix(str, &len, &off)) {
  47. return nullptr;
  48. }
  49. str.Skip(len);
  50. return &Get(off);
  51. }
  52. void TEntrySet::SetModelP() {
  53. for (iterator it = begin(); it != end(); ++it) {
  54. TEntry& e = *it;
  55. if (!e.HasPrefix()) {
  56. e.ModelP = 0;
  57. continue;
  58. }
  59. TStringBuf suff = e.Str;
  60. const TEntry& p = Get(e.NearestPrefix);
  61. suff.Skip(p.Len());
  62. float modelp = float(p.Count + e.Count) / TotalCount;
  63. while (!!suff) {
  64. TEntry* pp = FindPrefix(suff);
  65. modelp *= float(pp->Count + e.Count) / TotalCount;
  66. }
  67. e.ModelP = modelp;
  68. }
  69. }
  70. void TEntrySet::SetScores(EEntryScore s) {
  71. for (auto& it : *this) {
  72. it.Score = Score(s, it.Len(), it.ModelP, it.Count, TotalCount);
  73. }
  74. }
  75. }