#include "minimize.h" #include "node.h" #include "writeable_node.h" #include "write_trie_backwards.h" #include "comptrie_impl.h" #include #include namespace NCompactTrie { // Minimize the trie. The result is equivalent to the original // trie, except that it takes less space (and has marginally lower // performance, because of eventual epsilon links). // The algorithm is as follows: starting from the largest pieces, we find // nodes that have identical continuations (Daciuk's right language), // and repack the trie. Repacking is done in-place, so memory is less // of an issue; however, it may take considerable time. // IMPORTANT: never try to reminimize an already minimized trie or a trie with fast layout. // Because of non-local structure and epsilon links, it won't work // as you expect it to, and can destroy the trie in the making. namespace { using TOffsetList = TVector; using TPieceIndex = THashMap; using TSizePair = std::pair; using TSizePairVector = TVector; using TSizePairVectorVector = TVector; class TOffsetMap { protected: TSizePairVectorVector Data; public: TOffsetMap() { Data.reserve(0x10000); } void Add(size_t key, size_t value) { size_t hikey = key & 0xFFFF; if (Data.size() <= hikey) Data.resize(hikey + 1); TSizePairVector& sublist = Data[hikey]; for (auto& it : sublist) { if (it.first == key) { it.second = value; return; } } sublist.push_back(TSizePair(key, value)); } bool Contains(size_t key) const { return (Get(key) != 0); } size_t Get(size_t key) const { size_t hikey = key & 0xFFFF; if (Data.size() <= hikey) return 0; const TSizePairVector& sublist = Data[hikey]; for (const auto& it : sublist) { if (it.first == key) return it.second; } return 0; } }; class TOffsetDeltas { protected: TSizePairVector Data; public: void Add(size_t key, size_t value) { if (Data.empty()) { if (key == value) return; // no offset } else { TSizePair last = Data.back(); if (key <= last.first) { Cerr << "Trouble: elements to offset delta list added in wrong order" << Endl; return; } if (last.first + value == last.second + key) return; // same offset } Data.push_back(TSizePair(key, value)); } size_t Get(size_t key) const { if (Data.empty()) return key; // difference is zero; if (key < Data.front().first) return key; // Binary search for the highest entry in the list that does not exceed the key size_t from = 0; size_t to = Data.size() - 1; while (from < to) { size_t midpoint = (from + to + 1) / 2; if (key < Data[midpoint].first) to = midpoint - 1; else from = midpoint; } TSizePair entry = Data[from]; return key - entry.first + entry.second; } }; class TPieceComparer { private: const char* Data; const size_t Length; public: TPieceComparer(const char* buf, size_t len) : Data(buf) , Length(len) { } bool operator()(size_t p1, const size_t p2) { int res = memcmp(Data + p1, Data + p2, Length); if (res) return (res > 0); return (p1 > p2); // the pieces are sorted in the reverse order of appearance } }; struct TBranchPoint { TNode Node; int Selector; public: TBranchPoint() : Selector(0) { } TBranchPoint(const char* data, size_t offset, const ILeafSkipper& skipFunction) : Node(data, offset, skipFunction) , Selector(0) { } bool IsFinal() const { return Node.IsFinal(); } // NextNode returns child nodes, starting from the last node: Right, then Left, then Forward size_t NextNode(const TOffsetMap& mergedNodes) { while (Selector < 3) { size_t nextOffset = 0; switch (++Selector) { case 1: if (Node.GetRightOffset()) nextOffset = Node.GetRightOffset(); break; case 2: if (Node.GetLeftOffset()) nextOffset = 
        class TPieceComparer {
        private:
            const char* Data;
            const size_t Length;

        public:
            TPieceComparer(const char* buf, size_t len)
                : Data(buf)
                , Length(len)
            {
            }

            bool operator()(size_t p1, const size_t p2) {
                int res = memcmp(Data + p1, Data + p2, Length);
                if (res)
                    return (res > 0);

                return (p1 > p2); // the pieces are sorted in the reverse order of appearance
            }
        };

        struct TBranchPoint {
            TNode Node;
            int Selector;

        public:
            TBranchPoint()
                : Selector(0)
            {
            }

            TBranchPoint(const char* data, size_t offset, const ILeafSkipper& skipFunction)
                : Node(data, offset, skipFunction)
                , Selector(0)
            {
            }

            bool IsFinal() const {
                return Node.IsFinal();
            }

            // NextNode returns child nodes, starting from the last node: Right, then Left, then Forward
            size_t NextNode(const TOffsetMap& mergedNodes) {
                while (Selector < 3) {
                    size_t nextOffset = 0;
                    switch (++Selector) {
                        case 1:
                            if (Node.GetRightOffset())
                                nextOffset = Node.GetRightOffset();
                            break;
                        case 2:
                            if (Node.GetLeftOffset())
                                nextOffset = Node.GetLeftOffset();
                            break;
                        case 3:
                            if (Node.GetForwardOffset())
                                nextOffset = Node.GetForwardOffset();
                            break;
                        default:
                            break;
                    }
                    if (nextOffset && !mergedNodes.Contains(nextOffset))
                        return nextOffset;
                }
                return 0;
            }
        };

        // Walks the trie depth-first, visiting children from last to first
        // and skipping merged nodes, so that nodes can be re-emitted back to front.
        class TMergingReverseNodeEnumerator: public TReverseNodeEnumerator {
        private:
            bool Fresh;
            TOpaqueTrie Trie;
            const TOffsetMap& MergeMap;
            TVector<TBranchPoint> Trace;
            TOffsetDeltas OffsetIndex;

        public:
            TMergingReverseNodeEnumerator(const TOpaqueTrie& trie, const TOffsetMap& mergers)
                : Fresh(true)
                , Trie(trie)
                , MergeMap(mergers)
            {
            }

            bool Move() override {
                if (Fresh) {
                    Trace.push_back(TBranchPoint(Trie.Data, 0, Trie.SkipFunction));
                    Fresh = false;
                } else {
                    if (Trace.empty())
                        return false;

                    Trace.pop_back();

                    if (Trace.empty())
                        return false;
                }

                size_t nextnode = Trace.back().NextNode(MergeMap);

                while (nextnode) {
                    Trace.push_back(TBranchPoint(Trie.Data, nextnode, Trie.SkipFunction));
                    nextnode = Trace.back().NextNode(MergeMap);
                }

                return (!Trace.empty());
            }

            const TNode& Get() const {
                return Trace.back().Node;
            }

            size_t GetLeafLength() const override {
                return Get().GetLeafLength();
            }

            // Returns recalculated offset from the end of the current node
            size_t PrepareOffset(size_t absoffset, size_t minilength) {
                if (!absoffset)
                    return NPOS;

                if (MergeMap.Contains(absoffset))
                    absoffset = MergeMap.Get(absoffset);
                return minilength - OffsetIndex.Get(Trie.Length - absoffset);
            }

            size_t RecreateNode(char* buffer, size_t resultLength) override {
                TWriteableNode newNode(Get(), Trie.Data);
                newNode.ForwardOffset = PrepareOffset(Get().GetForwardOffset(), resultLength);
                newNode.LeftOffset = PrepareOffset(Get().GetLeftOffset(), resultLength);
                newNode.RightOffset = PrepareOffset(Get().GetRightOffset(), resultLength);

                if (!buffer)
                    return newNode.Measure();

                const size_t len = newNode.Pack(buffer);
                OffsetIndex.Add(Trie.Length - Get().GetOffset(), resultLength + len);

                return len;
            }
        };
    }
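    // Illustrative sketch, not part of the original source, of the merging pass
    // implemented by FindEquivalentSubtries() below. Pieces of equal length are
    // sorted so that byte-identical ones become adjacent, and every duplicate is
    // then mapped onto its first (highest-offset) occurrence. For a hypothetical
    // batch of length-7 pieces at offsets {40, 112, 200}, where the bytes at 112
    // and 200 coincide:
    //
    //     Sort(batch.begin(), batch.end(), TPieceComparer(trie.Data, 7));
    //     // batch might now be {200, 112, 40}; since
    //     // memcmp(trie.Data + 200, trie.Data + 112, 7) == 0,
    //     // merger.Add(112, 200) records that 112 is replaceable by 200.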
    static void AddPiece(TPieceIndex& index, size_t offset, size_t len) {
        index[len].push_back(offset);
    }

    static TOffsetMap FindEquivalentSubtries(const TOpaqueTrie& trie, bool verbose, size_t minMergeSize) {
        // Tree nodes, arranged by span length.
        // When all nodes of a given size are considered, they pop off the queue.
        TPieceIndex subtries;
        TOffsetMap merger;
        // Start walking the trie from head.
        AddPiece(subtries, 0, trie.Length);

        size_t counter = 0;
        // Now consider all nodes with sizeable continuations
        for (size_t curlen = trie.Length; curlen >= minMergeSize && !subtries.empty(); curlen--) {
            TPieceIndex::iterator iit = subtries.find(curlen);

            if (iit == subtries.end())
                continue; // fast forward to the next available length value

            TOffsetList& batch = iit->second;
            TPieceComparer comparer(trie.Data, curlen);
            Sort(batch.begin(), batch.end(), comparer);

            TOffsetList::iterator it = batch.begin();
            while (it != batch.end()) {
                if (verbose)
                    ShowProgress(++counter);

                size_t offset = *it;

                // Fill the array with the subnodes of the element
                TNode node(trie.Data, offset, trie.SkipFunction);
                size_t end = offset + curlen;
                if (size_t rightOffset = node.GetRightOffset()) {
                    AddPiece(subtries, rightOffset, end - rightOffset);
                    end = rightOffset;
                }
                if (size_t leftOffset = node.GetLeftOffset()) {
                    AddPiece(subtries, leftOffset, end - leftOffset);
                    end = leftOffset;
                }
                if (size_t forwardOffset = node.GetForwardOffset()) {
                    AddPiece(subtries, forwardOffset, end - forwardOffset);
                }

                while (++it != batch.end()) {
                    // Find next different; until then, just add the offsets to the list of merged nodes.
                    size_t nextoffset = *it;

                    if (memcmp(trie.Data + offset, trie.Data + nextoffset, curlen))
                        break;

                    merger.Add(nextoffset, offset);
                }
            }

            subtries.erase(curlen);
        }
        if (verbose) {
            Cerr << counter << Endl;
        }
        return merger;
    }

    size_t RawCompactTrieMinimizeImpl(IOutputStream& os, TOpaqueTrie& trie, bool verbose, size_t minMergeSize, EMinimizeMode mode) {
        if (!trie.Data || !trie.Length) {
            return 0;
        }

        TOffsetMap merger = FindEquivalentSubtries(trie, verbose, minMergeSize);
        TMergingReverseNodeEnumerator enumerator(trie, merger);

        if (mode == MM_DEFAULT)
            return WriteTrieBackwards(os, enumerator, verbose);
        else
            return WriteTrieBackwardsNoAlloc(os, enumerator, trie, mode);
    }
}
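// Usage sketch, not part of the original source: minimizing a packed trie blob
// into a stream. The TOpaqueTrie constructor arguments are an assumption
// mirroring the Data/Length/SkipFunction members used above; "data", "size" and
// "skipper" are hypothetical. minMergeSize is the smallest piece worth merging
// (see the loop bound in FindEquivalentSubtries), and MM_DEFAULT selects the
// allocating WriteTrieBackwards path.
//
//     using namespace NCompactTrie;
//     TOpaqueTrie trie(data, size, skipper);
//     TStringStream out;
//     size_t minimizedSize = RawCompactTrieMinimizeImpl(
//         out, trie, /*verbose=*/false, /*minMergeSize=*/1, MM_DEFAULT);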