minimize.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359
  1. #include "minimize.h"
  2. #include "node.h"
  3. #include "writeable_node.h"
  4. #include "write_trie_backwards.h"
  5. #include "comptrie_impl.h"
  6. #include <util/generic/hash.h>
  7. #include <util/generic/algorithm.h>
  8. namespace NCompactTrie {
  9. // Minimize the trie. The result is equivalent to the original
  10. // trie, except that it takes less space (and has marginally lower
  11. // performance, because of eventual epsilon links).
  12. // The algorithm is as follows: starting from the largest pieces, we find
  13. // nodes that have identical continuations (Daciuk's right language),
  14. // and repack the trie. Repacking is done in-place, so memory is less
  15. // of an issue; however, it may take considerable time.
  16. // IMPORTANT: never try to reminimize an already minimized trie or a trie with fast layout.
  17. // Because of non-local structure and epsilon links, it won't work
  18. // as you expect it to, and can destroy the trie in the making.
  19. namespace {
  20. using TOffsetList = TVector<size_t>;
  21. using TPieceIndex = THashMap<size_t, TOffsetList>;
  22. using TSizePair = std::pair<size_t, size_t>;
  23. using TSizePairVector = TVector<TSizePair>;
  24. using TSizePairVectorVector = TVector<TSizePairVector>;
  25. class TOffsetMap {
  26. protected:
  27. TSizePairVectorVector Data;
  28. public:
  29. TOffsetMap() {
  30. Data.reserve(0x10000);
  31. }
  32. void Add(size_t key, size_t value) {
  33. size_t hikey = key & 0xFFFF;
  34. if (Data.size() <= hikey)
  35. Data.resize(hikey + 1);
  36. TSizePairVector& sublist = Data[hikey];
  37. for (auto& it : sublist) {
  38. if (it.first == key) {
  39. it.second = value;
  40. return;
  41. }
  42. }
  43. sublist.push_back(TSizePair(key, value));
  44. }
  45. bool Contains(size_t key) const {
  46. return (Get(key) != 0);
  47. }
  48. size_t Get(size_t key) const {
  49. size_t hikey = key & 0xFFFF;
  50. if (Data.size() <= hikey)
  51. return 0;
  52. const TSizePairVector& sublist = Data[hikey];
  53. for (const auto& it : sublist) {
  54. if (it.first == key)
  55. return it.second;
  56. }
  57. return 0;
  58. }
  59. };
  60. class TOffsetDeltas {
  61. protected:
  62. TSizePairVector Data;
  63. public:
  64. void Add(size_t key, size_t value) {
  65. if (Data.empty()) {
  66. if (key == value)
  67. return; // no offset
  68. } else {
  69. TSizePair last = Data.back();
  70. if (key <= last.first) {
  71. Cerr << "Trouble: elements to offset delta list added in wrong order" << Endl;
  72. return;
  73. }
  74. if (last.first + value == last.second + key)
  75. return; // same offset
  76. }
  77. Data.push_back(TSizePair(key, value));
  78. }
  79. size_t Get(size_t key) const {
  80. if (Data.empty())
  81. return key; // difference is zero;
  82. if (key < Data.front().first)
  83. return key;
  84. // Binary search for the highest entry in the list that does not exceed the key
  85. size_t from = 0;
  86. size_t to = Data.size() - 1;
  87. while (from < to) {
  88. size_t midpoint = (from + to + 1) / 2;
  89. if (key < Data[midpoint].first)
  90. to = midpoint - 1;
  91. else
  92. from = midpoint;
  93. }
  94. TSizePair entry = Data[from];
  95. return key - entry.first + entry.second;
  96. }
  97. };
  98. class TPieceComparer {
  99. private:
  100. const char* Data;
  101. const size_t Length;
  102. public:
  103. TPieceComparer(const char* buf, size_t len)
  104. : Data(buf)
  105. , Length(len)
  106. {
  107. }
  108. bool operator()(size_t p1, const size_t p2) {
  109. int res = memcmp(Data + p1, Data + p2, Length);
  110. if (res)
  111. return (res > 0);
  112. return (p1 > p2); // the pieces are sorted in the reverse order of appearance
  113. }
  114. };
  115. struct TBranchPoint {
  116. TNode Node;
  117. int Selector;
  118. public:
  119. TBranchPoint()
  120. : Selector(0)
  121. {
  122. }
  123. TBranchPoint(const char* data, size_t offset, const ILeafSkipper& skipFunction)
  124. : Node(data, offset, skipFunction)
  125. , Selector(0)
  126. {
  127. }
  128. bool IsFinal() const {
  129. return Node.IsFinal();
  130. }
  131. // NextNode returns child nodes, starting from the last node: Right, then Left, then Forward
  132. size_t NextNode(const TOffsetMap& mergedNodes) {
  133. while (Selector < 3) {
  134. size_t nextOffset = 0;
  135. switch (++Selector) {
  136. case 1:
  137. if (Node.GetRightOffset())
  138. nextOffset = Node.GetRightOffset();
  139. break;
  140. case 2:
  141. if (Node.GetLeftOffset())
  142. nextOffset = Node.GetLeftOffset();
  143. break;
  144. case 3:
  145. if (Node.GetForwardOffset())
  146. nextOffset = Node.GetForwardOffset();
  147. break;
  148. default:
  149. break;
  150. }
  151. if (nextOffset && !mergedNodes.Contains(nextOffset))
  152. return nextOffset;
  153. }
  154. return 0;
  155. }
  156. };
  157. class TMergingReverseNodeEnumerator: public TReverseNodeEnumerator {
  158. private:
  159. bool Fresh;
  160. TOpaqueTrie Trie;
  161. const TOffsetMap& MergeMap;
  162. TVector<TBranchPoint> Trace;
  163. TOffsetDeltas OffsetIndex;
  164. public:
  165. TMergingReverseNodeEnumerator(const TOpaqueTrie& trie, const TOffsetMap& mergers)
  166. : Fresh(true)
  167. , Trie(trie)
  168. , MergeMap(mergers)
  169. {
  170. }
  171. bool Move() override {
  172. if (Fresh) {
  173. Trace.push_back(TBranchPoint(Trie.Data, 0, Trie.SkipFunction));
  174. Fresh = false;
  175. } else {
  176. if (Trace.empty())
  177. return false;
  178. Trace.pop_back();
  179. if (Trace.empty())
  180. return false;
  181. }
  182. size_t nextnode = Trace.back().NextNode(MergeMap);
  183. while (nextnode) {
  184. Trace.push_back(TBranchPoint(Trie.Data, nextnode, Trie.SkipFunction));
  185. nextnode = Trace.back().NextNode(MergeMap);
  186. }
  187. return (!Trace.empty());
  188. }
  189. const TNode& Get() const {
  190. return Trace.back().Node;
  191. }
  192. size_t GetLeafLength() const override {
  193. return Get().GetLeafLength();
  194. }
  195. // Returns recalculated offset from the end of the current node
  196. size_t PrepareOffset(size_t absoffset, size_t minilength) {
  197. if (!absoffset)
  198. return NPOS;
  199. if (MergeMap.Contains(absoffset))
  200. absoffset = MergeMap.Get(absoffset);
  201. return minilength - OffsetIndex.Get(Trie.Length - absoffset);
  202. }
  203. size_t RecreateNode(char* buffer, size_t resultLength) override {
  204. TWriteableNode newNode(Get(), Trie.Data);
  205. newNode.ForwardOffset = PrepareOffset(Get().GetForwardOffset(), resultLength);
  206. newNode.LeftOffset = PrepareOffset(Get().GetLeftOffset(), resultLength);
  207. newNode.RightOffset = PrepareOffset(Get().GetRightOffset(), resultLength);
  208. if (!buffer)
  209. return newNode.Measure();
  210. const size_t len = newNode.Pack(buffer);
  211. OffsetIndex.Add(Trie.Length - Get().GetOffset(), resultLength + len);
  212. return len;
  213. }
  214. };
  215. }
  216. static void AddPiece(TPieceIndex& index, size_t offset, size_t len) {
  217. index[len].push_back(offset);
  218. }
  219. static TOffsetMap FindEquivalentSubtries(const TOpaqueTrie& trie, bool verbose, size_t minMergeSize) {
  220. // Tree nodes, arranged by span length.
  221. // When all nodes of a given size are considered, they pop off the queue.
  222. TPieceIndex subtries;
  223. TOffsetMap merger;
  224. // Start walking the trie from head.
  225. AddPiece(subtries, 0, trie.Length);
  226. size_t counter = 0;
  227. // Now consider all nodes with sizeable continuations
  228. for (size_t curlen = trie.Length; curlen >= minMergeSize && !subtries.empty(); curlen--) {
  229. TPieceIndex::iterator iit = subtries.find(curlen);
  230. if (iit == subtries.end())
  231. continue; // fast forward to the next available length value
  232. TOffsetList& batch = iit->second;
  233. TPieceComparer comparer(trie.Data, curlen);
  234. Sort(batch.begin(), batch.end(), comparer);
  235. TOffsetList::iterator it = batch.begin();
  236. while (it != batch.end()) {
  237. if (verbose)
  238. ShowProgress(++counter);
  239. size_t offset = *it;
  240. // Fill the array with the subnodes of the element
  241. TNode node(trie.Data, offset, trie.SkipFunction);
  242. size_t end = offset + curlen;
  243. if (size_t rightOffset = node.GetRightOffset()) {
  244. AddPiece(subtries, rightOffset, end - rightOffset);
  245. end = rightOffset;
  246. }
  247. if (size_t leftOffset = node.GetLeftOffset()) {
  248. AddPiece(subtries, leftOffset, end - leftOffset);
  249. end = leftOffset;
  250. }
  251. if (size_t forwardOffset = node.GetForwardOffset()) {
  252. AddPiece(subtries, forwardOffset, end - forwardOffset);
  253. }
  254. while (++it != batch.end()) {
  255. // Find next different; until then, just add the offsets to the list of merged nodes.
  256. size_t nextoffset = *it;
  257. if (memcmp(trie.Data + offset, trie.Data + nextoffset, curlen))
  258. break;
  259. merger.Add(nextoffset, offset);
  260. }
  261. }
  262. subtries.erase(curlen);
  263. }
  264. if (verbose) {
  265. Cerr << counter << Endl;
  266. }
  267. return merger;
  268. }
  269. size_t RawCompactTrieMinimizeImpl(IOutputStream& os, TOpaqueTrie& trie, bool verbose, size_t minMergeSize, EMinimizeMode mode) {
  270. if (!trie.Data || !trie.Length) {
  271. return 0;
  272. }
  273. TOffsetMap merger = FindEquivalentSubtries(trie, verbose, minMergeSize);
  274. TMergingReverseNodeEnumerator enumerator(trie, merger);
  275. if (mode == MM_DEFAULT)
  276. return WriteTrieBackwards(os, enumerator, verbose);
  277. else
  278. return WriteTrieBackwardsNoAlloc(os, enumerator, trie, mode);
  279. }
  280. }