#include "minimize.h"
#include "node.h"
#include "writeable_node.h"
#include "write_trie_backwards.h"
#include "comptrie_impl.h"

#include <util/generic/hash.h>
#include <util/generic/algorithm.h>

#include <cstring> // memcmp
namespace NCompactTrie {
    // Minimize the trie. The result is equivalent to the original
    // trie, except that it takes less space (and has marginally lower
    // performance, because of the epsilon links that merging may introduce).
    // The algorithm is as follows: starting from the largest pieces, we find
    // nodes that have identical continuations (Daciuk's right language),
    // and repack the trie. Repacking is done in-place, so memory is less
    // of an issue; however, it may take considerable time.

    // IMPORTANT: never try to minimize an already minimized trie or a trie
    // with fast layout. Because of the non-local structure and epsilon links,
    // it won't work as you expect it to, and may corrupt the trie in the process.
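
    // A minimal usage sketch (hedged: TBufferOutput and the skipper variable
    // are assumptions for illustration; only RawCompactTrieMinimizeImpl and
    // MM_DEFAULT are defined by this module):
    //
    //     TOpaqueTrie trie(packedData, packedLength, skipper);
    //     TBufferOutput out;
    //     size_t newSize = RawCompactTrieMinimizeImpl(
    //         out, trie, /*verbose=*/false, /*minMergeSize=*/1, MM_DEFAULT);
    //
    // On success, out holds an equivalent trie with shared subtrees merged.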
    namespace {
        using TOffsetList = TVector<size_t>;
        using TPieceIndex = THashMap<size_t, TOffsetList>;

        using TSizePair = std::pair<size_t, size_t>;
        using TSizePairVector = TVector<TSizePair>;
        using TSizePairVectorVector = TVector<TSizePairVector>;
        // A map from trie offsets to canonical offsets, bucketed by the low
        // 16 bits of the key so that lookups stay cheap without hashing.
        class TOffsetMap {
        protected:
            TSizePairVectorVector Data;

        public:
            TOffsetMap() {
                Data.reserve(0x10000);
            }

            void Add(size_t key, size_t value) {
                size_t hikey = key & 0xFFFF;

                if (Data.size() <= hikey)
                    Data.resize(hikey + 1);

                TSizePairVector& sublist = Data[hikey];
                for (auto& it : sublist) {
                    if (it.first == key) {
                        it.second = value;
                        return;
                    }
                }

                sublist.push_back(TSizePair(key, value));
            }

            bool Contains(size_t key) const {
                return (Get(key) != 0);
            }

            // Returns 0 for a missing key; 0 doubles as "not found" because
            // offset 0 (the trie root) never appears as a merged node.
            size_t Get(size_t key) const {
                size_t hikey = key & 0xFFFF;

                if (Data.size() <= hikey)
                    return 0;

                const TSizePairVector& sublist = Data[hikey];
                for (const auto& it : sublist) {
                    if (it.first == key)
                        return it.second;
                }

                return 0;
            }
        };
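
        // Intended use, with hypothetical values: after m.Add(1234, 567),
        // m.Get(1234) returns the canonical offset 567, while m.Get(42)
        // returns 0 and hence m.Contains(42) is false.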
        // A compressed, monotone offset remapping: it stores a (key, value)
        // anchor only where the difference value - key changes, and answers
        // lookups by binary search over the anchors.
        class TOffsetDeltas {
        protected:
            TSizePairVector Data;

        public:
            void Add(size_t key, size_t value) {
                if (Data.empty()) {
                    if (key == value)
                        return; // zero delta, nothing to record
                } else {
                    TSizePair last = Data.back();

                    if (key <= last.first) {
                        Cerr << "Trouble: elements added to the offset delta list in the wrong order" << Endl;
                        return;
                    }

                    if (last.first + value == last.second + key)
                        return; // same delta as the previous anchor
                }

                Data.push_back(TSizePair(key, value));
            }

            size_t Get(size_t key) const {
                if (Data.empty())
                    return key; // the difference is zero
                if (key < Data.front().first)
                    return key;

                // Binary search for the highest entry that does not exceed the key
                size_t from = 0;
                size_t to = Data.size() - 1;

                while (from < to) {
                    size_t midpoint = (from + to + 1) / 2;

                    if (key < Data[midpoint].first)
                        to = midpoint - 1;
                    else
                        from = midpoint;
                }

                TSizePair entry = Data[from];
                return key - entry.first + entry.second;
            }
        };
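
        // Illustration with hypothetical values: Add(10, 10) stores nothing
        // (zero delta); Add(20, 25) stores an anchor with delta +5; a later
        // Add(30, 35) stores nothing because the delta is unchanged; Get(27)
        // then returns 32 through the (20, 25) anchor.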
        // Orders piece offsets by the byte content of the pieces (all of the
        // same Length), so that identical pieces become adjacent after sorting.
        class TPieceComparer {
        private:
            const char* Data;
            const size_t Length;

        public:
            TPieceComparer(const char* buf, size_t len)
                : Data(buf)
                , Length(len)
            {
            }

            bool operator()(size_t p1, size_t p2) const {
                int res = memcmp(Data + p1, Data + p2, Length);
                if (res)
                    return (res > 0);

                return (p1 > p2); // the pieces are sorted in reverse order of appearance
            }
        };
        struct TBranchPoint {
            TNode Node;
            int Selector;

        public:
            TBranchPoint()
                : Selector(0)
            {
            }

            TBranchPoint(const char* data, size_t offset, const ILeafSkipper& skipFunction)
                : Node(data, offset, skipFunction)
                , Selector(0)
            {
            }

            bool IsFinal() const {
                return Node.IsFinal();
            }

            // NextNode returns the child nodes one at a time: Right first, then Left, then Forward
            size_t NextNode(const TOffsetMap& mergedNodes) {
                while (Selector < 3) {
                    size_t nextOffset = 0;

                    switch (++Selector) {
                        case 1:
                            nextOffset = Node.GetRightOffset();
                            break;
                        case 2:
                            nextOffset = Node.GetLeftOffset();
                            break;
                        case 3:
                            nextOffset = Node.GetForwardOffset();
                            break;
                        default:
                            break;
                    }

                    if (nextOffset && !mergedNodes.Contains(nextOffset))
                        return nextOffset;
                }

                return 0;
            }
        };
        class TMergingReverseNodeEnumerator: public TReverseNodeEnumerator {
        private:
            bool Fresh;
            TOpaqueTrie Trie;
            const TOffsetMap& MergeMap;
            TVector<TBranchPoint> Trace;
            TOffsetDeltas OffsetIndex;

        public:
            TMergingReverseNodeEnumerator(const TOpaqueTrie& trie, const TOffsetMap& mergers)
                : Fresh(true)
                , Trie(trie)
                , MergeMap(mergers)
            {
            }

            // Depth-first traversal that never descends into subtrees already
            // scheduled for merging; each Move() leaves the deepest unvisited
            // node on top of the trace, so nodes are produced children-first.
            bool Move() override {
                if (Fresh) {
                    Trace.push_back(TBranchPoint(Trie.Data, 0, Trie.SkipFunction));
                    Fresh = false;
                } else {
                    if (Trace.empty())
                        return false;

                    Trace.pop_back();

                    if (Trace.empty())
                        return false;
                }

                size_t nextnode = Trace.back().NextNode(MergeMap);
                while (nextnode) {
                    Trace.push_back(TBranchPoint(Trie.Data, nextnode, Trie.SkipFunction));
                    nextnode = Trace.back().NextNode(MergeMap);
                }

                return (!Trace.empty());
            }

            const TNode& Get() const {
                return Trace.back().Node;
            }

            size_t GetLeafLength() const override {
                return Get().GetLeafLength();
            }

            // Returns the recalculated offset, counted from the end of the current node
            size_t PrepareOffset(size_t absoffset, size_t minilength) {
                if (!absoffset)
                    return NPOS;

                // If the target has been merged into an earlier twin, point there instead.
                if (MergeMap.Contains(absoffset))
                    absoffset = MergeMap.Get(absoffset);

                return minilength - OffsetIndex.Get(Trie.Length - absoffset);
            }

            // With a null buffer, only measures the repacked node; otherwise
            // packs it and records the old-to-new offset mapping.
            size_t RecreateNode(char* buffer, size_t resultLength) override {
                TWriteableNode newNode(Get(), Trie.Data);
                newNode.ForwardOffset = PrepareOffset(Get().GetForwardOffset(), resultLength);
                newNode.LeftOffset = PrepareOffset(Get().GetLeftOffset(), resultLength);
                newNode.RightOffset = PrepareOffset(Get().GetRightOffset(), resultLength);

                if (!buffer)
                    return newNode.Measure();

                const size_t len = newNode.Pack(buffer);
                OffsetIndex.Add(Trie.Length - Get().GetOffset(), resultLength + len);

                return len;
            }
        };
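
        // Sketch of the contract assumed by the overrides above (hedged; the
        // authoritative definition is TReverseNodeEnumerator in
        // write_trie_backwards.h): the writer repeatedly calls Move(), asks
        // RecreateNode(nullptr, written) for the size of the repacked node,
        // then calls RecreateNode(buffer, written) to emit it, while the trie
        // is written back to front.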
    }
    static void AddPiece(TPieceIndex& index, size_t offset, size_t len) {
        index[len].push_back(offset);
    }
    static TOffsetMap FindEquivalentSubtries(const TOpaqueTrie& trie, bool verbose, size_t minMergeSize) {
        // Trie pieces, grouped by their span length.
        // Once all pieces of a given length have been considered, the group is popped off the index.
        TPieceIndex subtries;
        TOffsetMap merger;
        // Start walking the trie from its head.
        AddPiece(subtries, 0, trie.Length);

        size_t counter = 0;
        // Consider all pieces, from the longest down to minMergeSize.
        for (size_t curlen = trie.Length; curlen >= minMergeSize && !subtries.empty(); curlen--) {
            TPieceIndex::iterator iit = subtries.find(curlen);

            if (iit == subtries.end())
                continue; // fast-forward to the next available length value

            TOffsetList& batch = iit->second;
            TPieceComparer comparer(trie.Data, curlen);
            Sort(batch.begin(), batch.end(), comparer);

            TOffsetList::iterator it = batch.begin();
            while (it != batch.end()) {
                if (verbose)
                    ShowProgress(++counter);

                size_t offset = *it;

                // Split the piece into its root node and the spans of its children.
                TNode node(trie.Data, offset, trie.SkipFunction);
                size_t end = offset + curlen;

                if (size_t rightOffset = node.GetRightOffset()) {
                    AddPiece(subtries, rightOffset, end - rightOffset);
                    end = rightOffset;
                }
                if (size_t leftOffset = node.GetLeftOffset()) {
                    AddPiece(subtries, leftOffset, end - leftOffset);
                    end = leftOffset;
                }
                if (size_t forwardOffset = node.GetForwardOffset()) {
                    AddPiece(subtries, forwardOffset, end - forwardOffset);
                }

                // Skip ahead to the next piece with different content; until then,
                // record each byte-identical piece as merged into the current one.
                while (++it != batch.end()) {
                    size_t nextoffset = *it;

                    if (memcmp(trie.Data + offset, trie.Data + nextoffset, curlen))
                        break;

                    merger.Add(nextoffset, offset);
                }
            }

            subtries.erase(curlen);
        }

        if (verbose) {
            Cerr << counter << Endl;
        }

        return merger;
    }
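
    // Conceptual illustration (not tied to the exact byte layout): in a trie
    // holding "cat" and "bat", the packed pieces for the shared tail "at" are
    // byte-identical, so the second copy's offset is mapped to the first one
    // here, and the enumerator above then emits that piece only once.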
    size_t RawCompactTrieMinimizeImpl(IOutputStream& os, TOpaqueTrie& trie, bool verbose, size_t minMergeSize, EMinimizeMode mode) {
        if (!trie.Data || !trie.Length) {
            return 0;
        }

        TOffsetMap merger = FindEquivalentSubtries(trie, verbose, minMergeSize);
        TMergingReverseNodeEnumerator enumerator(trie, merger);

        if (mode == MM_DEFAULT)
            return WriteTrieBackwards(os, enumerator, verbose);
        else
            return WriteTrieBackwardsNoAlloc(os, enumerator, trie, mode);
    }
}