// gd_builder.h
  1. #pragma once
  2. #include "gd_entry.h"
  3. #include <util/generic/hash.h>
  4. #include <util/random/fast.h>
  5. namespace NGreedyDict {
  6. struct TBuildSettings {
  7. EEntryStatTest StatTest = EST_SIMPLE_NORM;
  8. EEntryScore Score = ES_LEN_SIMPLE;
  9. float MinPValue = 0.75;
  10. ui32 MinAbsCount = 10;
  11. ui32 GrowLimit = 10; // times of maxentries
  12. bool Verbose = false;
  13. };
  14. class TDictBuilder {
  15. using TCompoundCounts = THashMap<ui64, ui32, THash<ui64>, TEqualTo<ui64>, TPoolAllocator>;
  16. using TCandidate = std::pair<float, ui64>;
  17. using TCandidates = TVector<TCandidate>;
  18. private:
  19. TFastRng64 Rng{0x1a5d0ac170565c1c, 0x0be7bc27, 0x6235f6f57820aa0d, 0xafdc7fb};
  20. TStringBufs Input;
  21. THolder<TEntrySet> Current;
  22. TMemoryPool CompoundCountsPool;
  23. THolder<TCompoundCounts> CompoundCounts;
  24. TCandidates Candidates;
  25. TBuildSettings Settings;
  26. public:
  27. TDictBuilder(const TBuildSettings& s = TBuildSettings())
  28. : CompoundCountsPool(8112, TMemoryPool::TLinearGrow::Instance())
  29. , Settings(s)
  30. {
  31. }
  32. void SetInput(const TStringBufs& in) {
  33. Input = in;
  34. }
  35. const TBuildSettings& GetSettings() const {
  36. return Settings;
  37. }
  38. TBuildSettings& GetSettings() {
  39. return Settings;
  40. }
  41. void SetSettings(const TBuildSettings& s) {
  42. Settings = s;
  43. }
  44. TEntrySet& EntrySet() {
  45. return *Current;
  46. }
  47. const TEntrySet& EntrySet() const {
  48. return *Current;
  49. }
  50. THolder<TEntrySet> ReleaseEntrySet() {
  51. return std::move(Current);
  52. }
  53. ui32 /*iters*/ Build(ui32 maxentries, ui32 maxiters = 16, ui32 maxlen = -1, ui32 mindiff = 10);
  54. public:
  55. void RebuildCounts(ui32 maxcand, bool final);
  56. ui32 /*diff size*/ BuildNextGeneration(ui32 maxent, ui32 maxlen);
  57. static bool IsCompound(ui64 ent) {
  58. return ent & 0xFFFFFFFF00000000ULL;
  59. }
  60. static ui32 Next(ui64 ent) {
  61. return ent;
  62. }
  63. static ui32 Prev(ui64 ent) {
  64. return (ent >> 32) - 1;
  65. }
  66. static ui64 Compose(ui32 prev, ui32 next) {
  67. return ((prev + 1ULL) << 32) | next;
  68. }
  69. };
  70. }