// gd_entry.h (2.5 KB)
  1. #pragma once
  2. #include "gd_stats.h"
  3. #include <library/cpp/containers/comptrie/comptrie.h>
  4. #include <util/generic/ptr.h>
  5. #include <util/generic/strbuf.h>
  6. #include <util/generic/vector.h>
  7. #include <util/memory/pool.h>
  8. namespace NGreedyDict {
  // Shorthand for a vector of TStringBuf values.
  using TStringBufs = TVector<TStringBuf>;
  10. struct TEntry {
  11. static const i32 NoPrefix = -1;
  12. TStringBuf Str;
  13. i32 NearestPrefix = NoPrefix;
  14. ui32 Count = 0;
  15. ui32 Number = 0;
  16. float ModelP = 0;
  17. float Score = 0;
  18. TEntry(TStringBuf b = TStringBuf(), ui32 cnt = 0)
  19. : Str(b)
  20. , Count(cnt)
  21. {
  22. }
  23. bool HasPrefix() const {
  24. return NearestPrefix != NoPrefix;
  25. }
  26. ui32 Len() const {
  27. return Str.size();
  28. }
  29. static bool StrLess(const TEntry& a, const TEntry& b) {
  30. return a.Str < b.Str;
  31. }
  32. static bool NumberLess(const TEntry& a, const TEntry& b) {
  33. return a.Number < b.Number;
  34. }
  35. static bool ScoreMore(const TEntry& a, const TEntry& b) {
  36. return a.Score > b.Score;
  37. }
  38. };
  39. class TEntrySet: public TVector<TEntry>, TNonCopyable {
  40. TMemoryPool Pool{8112};
  41. TCompactTrie<char, ui32, TAsIsPacker<ui32>> Trie;
  42. public:
  43. ui32 TotalCount = 0;
  44. void InitWithAlpha();
  45. void Add(TStringBuf a) {
  46. push_back(TStringBuf(Pool.Append(a.data(), a.size()), a.size()));
  47. }
  48. void Add(TStringBuf a, TStringBuf b) {
  49. size_t sz = a.size() + b.size();
  50. char* p = (char*)Pool.Allocate(sz);
  51. memcpy(p, a.data(), a.size());
  52. memcpy(p + a.size(), b.data(), b.size());
  53. push_back(TStringBuf(p, sz));
  54. }
  55. TEntry& Get(ui32 idx) {
  56. return (*this)[idx];
  57. }
  58. const TEntry& Get(ui32 idx) const {
  59. return (*this)[idx];
  60. }
  61. void BuildHierarchy();
  62. // longest prefix
  63. TEntry* FindPrefix(TStringBuf& str);
  64. const TEntry* FindPrefix(TStringBuf& str) const {
  65. return ((TEntrySet*)this)->FindPrefix(str);
  66. }
  67. const TEntry* FirstPrefix(const TEntry& e, TStringBuf& suff) {
  68. if (!e.HasPrefix())
  69. return nullptr;
  70. const TEntry& p = Get(e.NearestPrefix);
  71. suff = e.Str;
  72. suff.Skip(p.Str.size());
  73. return &p;
  74. }
  75. void SetModelP();
  76. void SetScores(EEntryScore);
  77. };
  78. }