SuffixTree.h 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361
  1. #pragma once
  2. #ifdef __GNUC__
  3. #pragma GCC diagnostic push
  4. #pragma GCC diagnostic ignored "-Wunused-parameter"
  5. #endif
  6. //===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===//
  7. //
  8. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  9. // See https://llvm.org/LICENSE.txt for license information.
  10. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  11. //
  12. //===----------------------------------------------------------------------===//
  13. //
  14. // This file defines the Suffix Tree class and Suffix Tree Node struct.
  15. //
  16. //===----------------------------------------------------------------------===//
  17. #ifndef LLVM_SUPPORT_SUFFIXTREE_H
  18. #define LLVM_SUPPORT_SUFFIXTREE_H
  19. #include "llvm/ADT/ArrayRef.h"
  20. #include "llvm/ADT/DenseMap.h"
  21. #include "llvm/Support/Allocator.h"
  22. #include <vector>
  23. namespace llvm {
  24. /// Represents an undefined index in the suffix tree.
  25. const unsigned EmptyIdx = -1;
  26. /// A node in a suffix tree which represents a substring or suffix.
  27. ///
  28. /// Each node has either no children or at least two children, with the root
  29. /// being a exception in the empty tree.
  30. ///
  31. /// Children are represented as a map between unsigned integers and nodes. If
  32. /// a node N has a child M on unsigned integer k, then the mapping represented
  33. /// by N is a proper prefix of the mapping represented by M. Note that this,
  34. /// although similar to a trie is somewhat different: each node stores a full
  35. /// substring of the full mapping rather than a single character state.
  36. ///
  37. /// Each internal node contains a pointer to the internal node representing
  38. /// the same string, but with the first character chopped off. This is stored
  39. /// in \p Link. Each leaf node stores the start index of its respective
  40. /// suffix in \p SuffixIdx.
  41. struct SuffixTreeNode {
  42. /// The children of this node.
  43. ///
  44. /// A child existing on an unsigned integer implies that from the mapping
  45. /// represented by the current node, there is a way to reach another
  46. /// mapping by tacking that character on the end of the current string.
  47. llvm::DenseMap<unsigned, SuffixTreeNode *> Children;
  48. /// The start index of this node's substring in the main string.
  49. unsigned StartIdx = EmptyIdx;
  50. /// The end index of this node's substring in the main string.
  51. ///
  52. /// Every leaf node must have its \p EndIdx incremented at the end of every
  53. /// step in the construction algorithm. To avoid having to update O(N)
  54. /// nodes individually at the end of every step, the end index is stored
  55. /// as a pointer.
  56. unsigned *EndIdx = nullptr;
  57. /// For leaves, the start index of the suffix represented by this node.
  58. ///
  59. /// For all other nodes, this is ignored.
  60. unsigned SuffixIdx = EmptyIdx;
  61. /// For internal nodes, a pointer to the internal node representing
  62. /// the same sequence with the first character chopped off.
  63. ///
  64. /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
  65. /// Ukkonen's algorithm does to achieve linear-time construction is
  66. /// keep track of which node the next insert should be at. This makes each
  67. /// insert O(1), and there are a total of O(N) inserts. The suffix link
  68. /// helps with inserting children of internal nodes.
  69. ///
  70. /// Say we add a child to an internal node with associated mapping S. The
  71. /// next insertion must be at the node representing S - its first character.
  72. /// This is given by the way that we iteratively build the tree in Ukkonen's
  73. /// algorithm. The main idea is to look at the suffixes of each prefix in the
  74. /// string, starting with the longest suffix of the prefix, and ending with
  75. /// the shortest. Therefore, if we keep pointers between such nodes, we can
  76. /// move to the next insertion point in O(1) time. If we don't, then we'd
  77. /// have to query from the root, which takes O(N) time. This would make the
  78. /// construction algorithm O(N^2) rather than O(N).
  79. SuffixTreeNode *Link = nullptr;
  80. /// The length of the string formed by concatenating the edge labels from the
  81. /// root to this node.
  82. unsigned ConcatLen = 0;
  83. /// Returns true if this node is a leaf.
  84. bool isLeaf() const { return SuffixIdx != EmptyIdx; }
  85. /// Returns true if this node is the root of its owning \p SuffixTree.
  86. bool isRoot() const { return StartIdx == EmptyIdx; }
  87. /// Return the number of elements in the substring associated with this node.
  88. size_t size() const {
  89. // Is it the root? If so, it's the empty string so return 0.
  90. if (isRoot())
  91. return 0;
  92. assert(*EndIdx != EmptyIdx && "EndIdx is undefined!");
  93. // Size = the number of elements in the string.
  94. // For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1.
  95. return *EndIdx - StartIdx + 1;
  96. }
  97. SuffixTreeNode(unsigned StartIdx, unsigned *EndIdx, SuffixTreeNode *Link)
  98. : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link) {}
  99. SuffixTreeNode() = default;
  100. };
  101. /// A data structure for fast substring queries.
  102. ///
  103. /// Suffix trees represent the suffixes of their input strings in their leaves.
  104. /// A suffix tree is a type of compressed trie structure where each node
  105. /// represents an entire substring rather than a single character. Each leaf
  106. /// of the tree is a suffix.
  107. ///
  108. /// A suffix tree can be seen as a type of state machine where each state is a
  109. /// substring of the full string. The tree is structured so that, for a string
  110. /// of length N, there are exactly N leaves in the tree. This structure allows
  111. /// us to quickly find repeated substrings of the input string.
  112. ///
  113. /// In this implementation, a "string" is a vector of unsigned integers.
  114. /// These integers may result from hashing some data type. A suffix tree can
  115. /// contain 1 or many strings, which can then be queried as one large string.
  116. ///
  117. /// The suffix tree is implemented using Ukkonen's algorithm for linear-time
  118. /// suffix tree construction. Ukkonen's algorithm is explained in more detail
  119. /// in the paper by Esko Ukkonen "On-line construction of suffix trees. The
  120. /// paper is available at
  121. ///
  122. /// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
  123. class SuffixTree {
  124. public:
  125. /// Each element is an integer representing an instruction in the module.
  126. llvm::ArrayRef<unsigned> Str;
  127. /// A repeated substring in the tree.
  128. struct RepeatedSubstring {
  129. /// The length of the string.
  130. unsigned Length;
  131. /// The start indices of each occurrence.
  132. std::vector<unsigned> StartIndices;
  133. };
  134. private:
  135. /// Maintains each node in the tree.
  136. llvm::SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator;
  137. /// The root of the suffix tree.
  138. ///
  139. /// The root represents the empty string. It is maintained by the
  140. /// \p NodeAllocator like every other node in the tree.
  141. SuffixTreeNode *Root = nullptr;
  142. /// Maintains the end indices of the internal nodes in the tree.
  143. ///
  144. /// Each internal node is guaranteed to never have its end index change
  145. /// during the construction algorithm; however, leaves must be updated at
  146. /// every step. Therefore, we need to store leaf end indices by reference
  147. /// to avoid updating O(N) leaves at every step of construction. Thus,
  148. /// every internal node must be allocated its own end index.
  149. llvm::BumpPtrAllocator InternalEndIdxAllocator;
  150. /// The end index of each leaf in the tree.
  151. unsigned LeafEndIdx = -1;
  152. /// Helper struct which keeps track of the next insertion point in
  153. /// Ukkonen's algorithm.
  154. struct ActiveState {
  155. /// The next node to insert at.
  156. SuffixTreeNode *Node = nullptr;
  157. /// The index of the first character in the substring currently being added.
  158. unsigned Idx = EmptyIdx;
  159. /// The length of the substring we have to add at the current step.
  160. unsigned Len = 0;
  161. };
  162. /// The point the next insertion will take place at in the
  163. /// construction algorithm.
  164. ActiveState Active;
  165. /// Allocate a leaf node and add it to the tree.
  166. ///
  167. /// \param Parent The parent of this node.
  168. /// \param StartIdx The start index of this node's associated string.
  169. /// \param Edge The label on the edge leaving \p Parent to this node.
  170. ///
  171. /// \returns A pointer to the allocated leaf node.
  172. SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, unsigned StartIdx,
  173. unsigned Edge);
  174. /// Allocate an internal node and add it to the tree.
  175. ///
  176. /// \param Parent The parent of this node. Only null when allocating the root.
  177. /// \param StartIdx The start index of this node's associated string.
  178. /// \param EndIdx The end index of this node's associated string.
  179. /// \param Edge The label on the edge leaving \p Parent to this node.
  180. ///
  181. /// \returns A pointer to the allocated internal node.
  182. SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, unsigned StartIdx,
  183. unsigned EndIdx, unsigned Edge);
  184. /// Set the suffix indices of the leaves to the start indices of their
  185. /// respective suffixes.
  186. void setSuffixIndices();
  187. /// Construct the suffix tree for the prefix of the input ending at
  188. /// \p EndIdx.
  189. ///
  190. /// Used to construct the full suffix tree iteratively. At the end of each
  191. /// step, the constructed suffix tree is either a valid suffix tree, or a
  192. /// suffix tree with implicit suffixes. At the end of the final step, the
  193. /// suffix tree is a valid tree.
  194. ///
  195. /// \param EndIdx The end index of the current prefix in the main string.
  196. /// \param SuffixesToAdd The number of suffixes that must be added
  197. /// to complete the suffix tree at the current phase.
  198. ///
  199. /// \returns The number of suffixes that have not been added at the end of
  200. /// this step.
  201. unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd);
  202. public:
  203. /// Construct a suffix tree from a sequence of unsigned integers.
  204. ///
  205. /// \param Str The string to construct the suffix tree for.
  206. SuffixTree(const std::vector<unsigned> &Str);
  207. /// Iterator for finding all repeated substrings in the suffix tree.
  208. struct RepeatedSubstringIterator {
  209. private:
  210. /// The current node we're visiting.
  211. SuffixTreeNode *N = nullptr;
  212. /// The repeated substring associated with this node.
  213. RepeatedSubstring RS;
  214. /// The nodes left to visit.
  215. std::vector<SuffixTreeNode *> ToVisit;
  216. /// The minimum length of a repeated substring to find.
  217. /// Since we're outlining, we want at least two instructions in the range.
  218. /// FIXME: This may not be true for targets like X86 which support many
  219. /// instruction lengths.
  220. const unsigned MinLength = 2;
  221. /// Move the iterator to the next repeated substring.
  222. void advance() {
  223. // Clear the current state. If we're at the end of the range, then this
  224. // is the state we want to be in.
  225. RS = RepeatedSubstring();
  226. N = nullptr;
  227. // Each leaf node represents a repeat of a string.
  228. std::vector<SuffixTreeNode *> LeafChildren;
  229. // Continue visiting nodes until we find one which repeats more than once.
  230. while (!ToVisit.empty()) {
  231. SuffixTreeNode *Curr = ToVisit.back();
  232. ToVisit.pop_back();
  233. LeafChildren.clear();
  234. // Keep track of the length of the string associated with the node. If
  235. // it's too short, we'll quit.
  236. unsigned Length = Curr->ConcatLen;
  237. // Iterate over each child, saving internal nodes for visiting, and
  238. // leaf nodes in LeafChildren. Internal nodes represent individual
  239. // strings, which may repeat.
  240. for (auto &ChildPair : Curr->Children) {
  241. // Save all of this node's children for processing.
  242. if (!ChildPair.second->isLeaf())
  243. ToVisit.push_back(ChildPair.second);
  244. // It's not an internal node, so it must be a leaf. If we have a
  245. // long enough string, then save the leaf children.
  246. else if (Length >= MinLength)
  247. LeafChildren.push_back(ChildPair.second);
  248. }
  249. // The root never represents a repeated substring. If we're looking at
  250. // that, then skip it.
  251. if (Curr->isRoot())
  252. continue;
  253. // Do we have any repeated substrings?
  254. if (LeafChildren.size() >= 2) {
  255. // Yes. Update the state to reflect this, and then bail out.
  256. N = Curr;
  257. RS.Length = Length;
  258. for (SuffixTreeNode *Leaf : LeafChildren)
  259. RS.StartIndices.push_back(Leaf->SuffixIdx);
  260. break;
  261. }
  262. }
  263. // At this point, either NewRS is an empty RepeatedSubstring, or it was
  264. // set in the above loop. Similarly, N is either nullptr, or the node
  265. // associated with NewRS.
  266. }
  267. public:
  268. /// Return the current repeated substring.
  269. RepeatedSubstring &operator*() { return RS; }
  270. RepeatedSubstringIterator &operator++() {
  271. advance();
  272. return *this;
  273. }
  274. RepeatedSubstringIterator operator++(int I) {
  275. RepeatedSubstringIterator It(*this);
  276. advance();
  277. return It;
  278. }
  279. bool operator==(const RepeatedSubstringIterator &Other) const {
  280. return N == Other.N;
  281. }
  282. bool operator!=(const RepeatedSubstringIterator &Other) const {
  283. return !(*this == Other);
  284. }
  285. RepeatedSubstringIterator(SuffixTreeNode *N) : N(N) {
  286. // Do we have a non-null node?
  287. if (N) {
  288. // Yes. At the first step, we need to visit all of N's children.
  289. // Note: This means that we visit N last.
  290. ToVisit.push_back(N);
  291. advance();
  292. }
  293. }
  294. };
  295. typedef RepeatedSubstringIterator iterator;
  296. iterator begin() { return iterator(Root); }
  297. iterator end() { return iterator(nullptr); }
  298. };
  299. } // namespace llvm
  300. #endif // LLVM_SUPPORT_SUFFIXTREE_H
  301. #ifdef __GNUC__
  302. #pragma GCC diagnostic pop
  303. #endif