comptrie_builder.h 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. #pragma once
  2. #include "comptrie_packer.h"
  3. #include "minimize.h"
  4. #include "key_selector.h"
  5. #include <util/stream/file.h>
  6. // --------------------------------------------------------------------------------------
  7. // Data Builder
  8. // To build the data buffer, we first create an automaton in memory. The automaton
  9. // is created incrementally. It actually helps a lot to have the input data prefix-grouped
  10. // by key; otherwise, memory consumption becomes a tough issue.
  11. // NOTE: building and serializing the automaton may be lengthy, and takes lots of memory.
  12. // PREFIX_GROUPED means that if we, while constructing a trie, add to the builder two keys with the same prefix,
  13. // then all the keys that we add between these two also have the same prefix.
  14. // Actually in this mode the builder can accept even more freely ordered input,
  15. // but for input as above it is guaranteed to work.
  16. enum ECompactTrieBuilderFlags {
  17. CTBF_NONE = 0,
  18. CTBF_PREFIX_GROUPED = 1 << 0,
  19. CTBF_VERBOSE = 1 << 1,
  20. CTBF_UNIQUE = 1 << 2,
  21. };
  22. using TCompactTrieBuilderFlags = ECompactTrieBuilderFlags;
  23. inline TCompactTrieBuilderFlags operator|(TCompactTrieBuilderFlags first, TCompactTrieBuilderFlags second) {
  24. return static_cast<TCompactTrieBuilderFlags>(static_cast<int>(first) | second);
  25. }
  26. inline TCompactTrieBuilderFlags& operator|=(TCompactTrieBuilderFlags& first, TCompactTrieBuilderFlags second) {
  27. return first = first | second;
  28. }
  29. template <typename T>
  30. class TArrayWithSizeHolder;
  31. template <class T = char, class D = ui64, class S = TCompactTriePacker<D>>
  32. class TCompactTrieBuilder {
  33. public:
  34. typedef T TSymbol;
  35. typedef D TData;
  36. typedef S TPacker;
  37. typedef typename TCompactTrieKeySelector<TSymbol>::TKey TKey;
  38. typedef typename TCompactTrieKeySelector<TSymbol>::TKeyBuf TKeyBuf;
  39. explicit TCompactTrieBuilder(TCompactTrieBuilderFlags flags = CTBF_NONE, TPacker packer = TPacker(), IAllocator* alloc = TDefaultAllocator::Instance());
  40. // All Add.. methods return true if it was a new key, false if the key already existed.
  41. bool Add(const TSymbol* key, size_t keylen, const TData& value);
  42. bool Add(const TKeyBuf& key, const TData& value) {
  43. return Add(key.data(), key.size(), value);
  44. }
  45. // add already serialized data
  46. bool AddPtr(const TSymbol* key, size_t keylen, const char* data);
  47. bool AddPtr(const TKeyBuf& key, const char* data) {
  48. return AddPtr(key.data(), key.size(), data);
  49. }
  50. bool AddSubtreeInFile(const TSymbol* key, size_t keylen, const TString& filename);
  51. bool AddSubtreeInFile(const TKeyBuf& key, const TString& filename) {
  52. return AddSubtreeInFile(key.data(), key.size(), filename);
  53. }
  54. bool AddSubtreeInBuffer(const TSymbol* key, size_t keylen, TArrayWithSizeHolder<char>&& buffer);
  55. bool AddSubtreeInBuffer(const TKeyBuf& key, TArrayWithSizeHolder<char>&& buffer) {
  56. return AddSubtreeInBuffer(key.data(), key.size(), std::move(buffer));
  57. }
  58. bool Find(const TSymbol* key, size_t keylen, TData* value) const;
  59. bool Find(const TKeyBuf& key, TData* value = nullptr) const {
  60. return Find(key.data(), key.size(), value);
  61. }
  62. bool FindLongestPrefix(const TSymbol* key, size_t keylen, size_t* prefixLen, TData* value = nullptr) const;
  63. bool FindLongestPrefix(const TKeyBuf& key, size_t* prefixLen, TData* value = nullptr) const {
  64. return FindLongestPrefix(key.data(), key.size(), prefixLen, value);
  65. }
  66. size_t Save(IOutputStream& os) const;
  67. size_t SaveAndDestroy(IOutputStream& os);
  68. size_t SaveToFile(const TString& fileName) const {
  69. TFixedBufferFileOutput out(fileName);
  70. return Save(out);
  71. }
  72. void Clear(); // Returns all memory to the system and resets the builder state.
  73. size_t GetEntryCount() const;
  74. size_t GetNodeCount() const;
  75. // Exact output file size in bytes.
  76. size_t MeasureByteSize() const {
  77. return Impl->MeasureByteSize();
  78. }
  79. protected:
  80. class TCompactTrieBuilderImpl;
  81. THolder<TCompactTrieBuilderImpl> Impl;
  82. };
  83. //----------------------------------------------------------------------------------------------------------------------
  84. // Minimize the trie. The result is equivalent to the original
  85. // trie, except that it takes less space (and has marginally lower
  86. // performance, because of eventual epsilon links).
  87. // The algorithm is as follows: starting from the largest pieces, we find
  88. // nodes that have identical continuations (Daciuk's right language),
  89. // and repack the trie. Repacking is done in-place, so memory is less
  90. // of an issue; however, it may take considerable time.
  91. // IMPORTANT: never try to reminimize an already minimized trie or a trie with fast layout.
  92. // Because of non-local structure and epsilon links, it won't work
  93. // as you expect it to, and can destroy the trie in the making.
  94. // If you want both minimization and fast layout, do the minimization first.
  95. template <class TPacker>
  96. size_t CompactTrieMinimize(IOutputStream& os, const char* data, size_t datalength, bool verbose = false, const TPacker& packer = TPacker(), NCompactTrie::EMinimizeMode mode = NCompactTrie::MM_DEFAULT);
  97. template <class TTrieBuilder>
  98. size_t CompactTrieMinimize(IOutputStream& os, const TTrieBuilder& builder, bool verbose = false);
  99. //----------------------------------------------------------------------------------------------------------------
  100. // Lay the trie in memory in such a way that there are less cache misses when jumping from root to leaf.
  101. // The trie becomes about 2% larger, but the access became about 25% faster in our experiments.
  102. // Can be called on minimized and non-minimized tries, in the first case in requires half a trie more memory.
  103. // Calling it the second time on the same trie does nothing.
  104. //
  105. // The algorithm is based on van Emde Boas layout as described in the yandex data school lectures on external memory algoritms
  106. // by Maxim Babenko and Ivan Puzyrevsky. The difference is that when we cut the tree into levels
  107. // two nodes connected by a forward link are put into the same level (because they usually lie near each other in the original tree).
  108. // The original paper (describing the layout in Section 2.1) is:
  109. // Michael A. Bender, Erik D. Demaine, Martin Farach-Colton. Cache-Oblivious B-Trees
  110. // SIAM Journal on Computing, volume 35, number 2, 2005, pages 341-358.
  111. // Available on the web: http://erikdemaine.org/papers/CacheObliviousBTrees_SICOMP/
  112. // Or: Michael A. Bender, Erik D. Demaine, and Martin Farach-Colton. Cache-Oblivious B-Trees
  113. // Proceedings of the 41st Annual Symposium
  114. // on Foundations of Computer Science (FOCS 2000), Redondo Beach, California, November 12-14, 2000, pages 399-409.
  115. // Available on the web: http://erikdemaine.org/papers/FOCS2000b/
  116. // (there is not much difference between these papers, actually).
  117. //
  118. template <class TPacker>
  119. size_t CompactTrieMakeFastLayout(IOutputStream& os, const char* data, size_t datalength, bool verbose = false, const TPacker& packer = TPacker());
  120. template <class TTrieBuilder>
  121. size_t CompactTrieMakeFastLayout(IOutputStream& os, const TTrieBuilder& builder, bool verbose = false);
  122. // Composition of minimization and fast layout
  123. template <class TPacker>
  124. size_t CompactTrieMinimizeAndMakeFastLayout(IOutputStream& os, const char* data, size_t datalength, bool verbose = false, const TPacker& packer = TPacker());
  125. template <class TTrieBuilder>
  126. size_t CompactTrieMinimizeAndMakeFastLayout(IOutputStream& os, const TTrieBuilder& builder, bool verbose = false);
  127. // Implementation details moved here.
  128. #include "comptrie_builder.inl"