123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159 |
- #pragma once
- #include "comptrie_packer.h"
- #include "minimize.h"
- #include "key_selector.h"
- #include <util/stream/file.h>
- // --------------------------------------------------------------------------------------
- // Data Builder
- // To build the data buffer, we first create an automaton in memory. The automaton
- // is created incrementally. It actually helps a lot to have the input data prefix-grouped
- // by key; otherwise, memory consumption becomes a tough issue.
- // NOTE: building and serializing the automaton may be lengthy, and takes lots of memory.
- // PREFIX_GROUPED means that if we, while constructing a trie, add to the builder two keys with the same prefix,
- // then all the keys that we add between these two also have the same prefix.
- // Actually in this mode the builder can accept even more freely ordered input,
- // but for input as above it is guaranteed to work.
- enum ECompactTrieBuilderFlags {
- CTBF_NONE = 0,
- CTBF_PREFIX_GROUPED = 1 << 0,
- CTBF_VERBOSE = 1 << 1,
- CTBF_UNIQUE = 1 << 2,
- };
- using TCompactTrieBuilderFlags = ECompactTrieBuilderFlags;
- inline TCompactTrieBuilderFlags operator|(TCompactTrieBuilderFlags first, TCompactTrieBuilderFlags second) {
- return static_cast<TCompactTrieBuilderFlags>(static_cast<int>(first) | second);
- }
- inline TCompactTrieBuilderFlags& operator|=(TCompactTrieBuilderFlags& first, TCompactTrieBuilderFlags second) {
- return first = first | second;
- }
- template <typename T>
- class TArrayWithSizeHolder;
- template <class T = char, class D = ui64, class S = TCompactTriePacker<D>>
- class TCompactTrieBuilder {
- public:
- typedef T TSymbol;
- typedef D TData;
- typedef S TPacker;
- typedef typename TCompactTrieKeySelector<TSymbol>::TKey TKey;
- typedef typename TCompactTrieKeySelector<TSymbol>::TKeyBuf TKeyBuf;
- explicit TCompactTrieBuilder(TCompactTrieBuilderFlags flags = CTBF_NONE, TPacker packer = TPacker(), IAllocator* alloc = TDefaultAllocator::Instance());
- // All Add.. methods return true if it was a new key, false if the key already existed.
- bool Add(const TSymbol* key, size_t keylen, const TData& value);
- bool Add(const TKeyBuf& key, const TData& value) {
- return Add(key.data(), key.size(), value);
- }
- // add already serialized data
- bool AddPtr(const TSymbol* key, size_t keylen, const char* data);
- bool AddPtr(const TKeyBuf& key, const char* data) {
- return AddPtr(key.data(), key.size(), data);
- }
- bool AddSubtreeInFile(const TSymbol* key, size_t keylen, const TString& filename);
- bool AddSubtreeInFile(const TKeyBuf& key, const TString& filename) {
- return AddSubtreeInFile(key.data(), key.size(), filename);
- }
- bool AddSubtreeInBuffer(const TSymbol* key, size_t keylen, TArrayWithSizeHolder<char>&& buffer);
- bool AddSubtreeInBuffer(const TKeyBuf& key, TArrayWithSizeHolder<char>&& buffer) {
- return AddSubtreeInBuffer(key.data(), key.size(), std::move(buffer));
- }
- bool Find(const TSymbol* key, size_t keylen, TData* value) const;
- bool Find(const TKeyBuf& key, TData* value = nullptr) const {
- return Find(key.data(), key.size(), value);
- }
- bool FindLongestPrefix(const TSymbol* key, size_t keylen, size_t* prefixLen, TData* value = nullptr) const;
- bool FindLongestPrefix(const TKeyBuf& key, size_t* prefixLen, TData* value = nullptr) const {
- return FindLongestPrefix(key.data(), key.size(), prefixLen, value);
- }
- size_t Save(IOutputStream& os) const;
- size_t SaveAndDestroy(IOutputStream& os);
- size_t SaveToFile(const TString& fileName) const {
- TFixedBufferFileOutput out(fileName);
- return Save(out);
- }
- void Clear(); // Returns all memory to the system and resets the builder state.
- size_t GetEntryCount() const;
- size_t GetNodeCount() const;
- // Exact output file size in bytes.
- size_t MeasureByteSize() const {
- return Impl->MeasureByteSize();
- }
- protected:
- class TCompactTrieBuilderImpl;
- THolder<TCompactTrieBuilderImpl> Impl;
- };
- //----------------------------------------------------------------------------------------------------------------------
- // Minimize the trie. The result is equivalent to the original
- // trie, except that it takes less space (and has marginally lower
- // performance, because of eventual epsilon links).
- // The algorithm is as follows: starting from the largest pieces, we find
- // nodes that have identical continuations (Daciuk's right language),
- // and repack the trie. Repacking is done in-place, so memory is less
- // of an issue; however, it may take considerable time.
- // IMPORTANT: never try to reminimize an already minimized trie or a trie with fast layout.
- // Because of non-local structure and epsilon links, it won't work
- // as you expect it to, and can destroy the trie in the making.
- // If you want both minimization and fast layout, do the minimization first.
- template <class TPacker>
- size_t CompactTrieMinimize(IOutputStream& os, const char* data, size_t datalength, bool verbose = false, const TPacker& packer = TPacker(), NCompactTrie::EMinimizeMode mode = NCompactTrie::MM_DEFAULT);
- template <class TTrieBuilder>
- size_t CompactTrieMinimize(IOutputStream& os, const TTrieBuilder& builder, bool verbose = false);
- //----------------------------------------------------------------------------------------------------------------
- // Lay the trie in memory in such a way that there are less cache misses when jumping from root to leaf.
- // The trie becomes about 2% larger, but the access became about 25% faster in our experiments.
- // Can be called on minimized and non-minimized tries, in the first case in requires half a trie more memory.
- // Calling it the second time on the same trie does nothing.
- //
- // The algorithm is based on van Emde Boas layout as described in the yandex data school lectures on external memory algoritms
- // by Maxim Babenko and Ivan Puzyrevsky. The difference is that when we cut the tree into levels
- // two nodes connected by a forward link are put into the same level (because they usually lie near each other in the original tree).
- // The original paper (describing the layout in Section 2.1) is:
- // Michael A. Bender, Erik D. Demaine, Martin Farach-Colton. Cache-Oblivious B-Trees
- // SIAM Journal on Computing, volume 35, number 2, 2005, pages 341-358.
- // Available on the web: http://erikdemaine.org/papers/CacheObliviousBTrees_SICOMP/
- // Or: Michael A. Bender, Erik D. Demaine, and Martin Farach-Colton. Cache-Oblivious B-Trees
- // Proceedings of the 41st Annual Symposium
- // on Foundations of Computer Science (FOCS 2000), Redondo Beach, California, November 12-14, 2000, pages 399-409.
- // Available on the web: http://erikdemaine.org/papers/FOCS2000b/
- // (there is not much difference between these papers, actually).
- //
- template <class TPacker>
- size_t CompactTrieMakeFastLayout(IOutputStream& os, const char* data, size_t datalength, bool verbose = false, const TPacker& packer = TPacker());
- template <class TTrieBuilder>
- size_t CompactTrieMakeFastLayout(IOutputStream& os, const TTrieBuilder& builder, bool verbose = false);
- // Composition of minimization and fast layout
- template <class TPacker>
- size_t CompactTrieMinimizeAndMakeFastLayout(IOutputStream& os, const char* data, size_t datalength, bool verbose = false, const TPacker& packer = TPacker());
- template <class TTrieBuilder>
- size_t CompactTrieMinimizeAndMakeFastLayout(IOutputStream& os, const TTrieBuilder& builder, bool verbose = false);
- // Implementation details moved here.
- #include "comptrie_builder.inl"
|