SuffixTree.cpp 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
  1. //===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file implements the Suffix Tree class.
  10. //
  11. //===----------------------------------------------------------------------===//
  12. #include "llvm/Support/SuffixTree.h"
  13. #include "llvm/Support/Allocator.h"
  14. #include <vector>
  15. using namespace llvm;
  16. SuffixTree::SuffixTree(const std::vector<unsigned> &Str) : Str(Str) {
  17. Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
  18. Active.Node = Root;
  19. // Keep track of the number of suffixes we have to add of the current
  20. // prefix.
  21. unsigned SuffixesToAdd = 0;
  22. // Construct the suffix tree iteratively on each prefix of the string.
  23. // PfxEndIdx is the end index of the current prefix.
  24. // End is one past the last element in the string.
  25. for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
  26. SuffixesToAdd++;
  27. LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
  28. SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
  29. }
  30. // Set the suffix indices of each leaf.
  31. assert(Root && "Root node can't be nullptr!");
  32. setSuffixIndices();
  33. }
  34. SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeNode &Parent,
  35. unsigned StartIdx, unsigned Edge) {
  36. assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
  37. SuffixTreeNode *N = new (NodeAllocator.Allocate())
  38. SuffixTreeNode(StartIdx, &LeafEndIdx, nullptr);
  39. Parent.Children[Edge] = N;
  40. return N;
  41. }
  42. SuffixTreeNode *SuffixTree::insertInternalNode(SuffixTreeNode *Parent,
  43. unsigned StartIdx,
  44. unsigned EndIdx, unsigned Edge) {
  45. assert(StartIdx <= EndIdx && "String can't start after it ends!");
  46. assert(!(!Parent && StartIdx != EmptyIdx) &&
  47. "Non-root internal nodes must have parents!");
  48. unsigned *E = new (InternalEndIdxAllocator) unsigned(EndIdx);
  49. SuffixTreeNode *N =
  50. new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx, E, Root);
  51. if (Parent)
  52. Parent->Children[Edge] = N;
  53. return N;
  54. }
  55. void SuffixTree::setSuffixIndices() {
  56. // List of nodes we need to visit along with the current length of the
  57. // string.
  58. std::vector<std::pair<SuffixTreeNode *, unsigned>> ToVisit;
  59. // Current node being visited.
  60. SuffixTreeNode *CurrNode = Root;
  61. // Sum of the lengths of the nodes down the path to the current one.
  62. unsigned CurrNodeLen = 0;
  63. ToVisit.push_back({CurrNode, CurrNodeLen});
  64. while (!ToVisit.empty()) {
  65. std::tie(CurrNode, CurrNodeLen) = ToVisit.back();
  66. ToVisit.pop_back();
  67. CurrNode->ConcatLen = CurrNodeLen;
  68. for (auto &ChildPair : CurrNode->Children) {
  69. assert(ChildPair.second && "Node had a null child!");
  70. ToVisit.push_back(
  71. {ChildPair.second, CurrNodeLen + ChildPair.second->size()});
  72. }
  73. // No children, so we are at the end of the string.
  74. if (CurrNode->Children.size() == 0 && !CurrNode->isRoot())
  75. CurrNode->SuffixIdx = Str.size() - CurrNodeLen;
  76. }
  77. }
  78. unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
  79. SuffixTreeNode *NeedsLink = nullptr;
  80. while (SuffixesToAdd > 0) {
  81. // Are we waiting to add anything other than just the last character?
  82. if (Active.Len == 0) {
  83. // If not, then say the active index is the end index.
  84. Active.Idx = EndIdx;
  85. }
  86. assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
  87. // The first character in the current substring we're looking at.
  88. unsigned FirstChar = Str[Active.Idx];
  89. // Have we inserted anything starting with FirstChar at the current node?
  90. if (Active.Node->Children.count(FirstChar) == 0) {
  91. // If not, then we can just insert a leaf and move to the next step.
  92. insertLeaf(*Active.Node, EndIdx, FirstChar);
  93. // The active node is an internal node, and we visited it, so it must
  94. // need a link if it doesn't have one.
  95. if (NeedsLink) {
  96. NeedsLink->Link = Active.Node;
  97. NeedsLink = nullptr;
  98. }
  99. } else {
  100. // There's a match with FirstChar, so look for the point in the tree to
  101. // insert a new node.
  102. SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
  103. unsigned SubstringLen = NextNode->size();
  104. // Is the current suffix we're trying to insert longer than the size of
  105. // the child we want to move to?
  106. if (Active.Len >= SubstringLen) {
  107. // If yes, then consume the characters we've seen and move to the next
  108. // node.
  109. Active.Idx += SubstringLen;
  110. Active.Len -= SubstringLen;
  111. Active.Node = NextNode;
  112. continue;
  113. }
  114. // Otherwise, the suffix we're trying to insert must be contained in the
  115. // next node we want to move to.
  116. unsigned LastChar = Str[EndIdx];
  117. // Is the string we're trying to insert a substring of the next node?
  118. if (Str[NextNode->StartIdx + Active.Len] == LastChar) {
  119. // If yes, then we're done for this step. Remember our insertion point
  120. // and move to the next end index. At this point, we have an implicit
  121. // suffix tree.
  122. if (NeedsLink && !Active.Node->isRoot()) {
  123. NeedsLink->Link = Active.Node;
  124. NeedsLink = nullptr;
  125. }
  126. Active.Len++;
  127. break;
  128. }
  129. // The string we're trying to insert isn't a substring of the next node,
  130. // but matches up to a point. Split the node.
  131. //
  132. // For example, say we ended our search at a node n and we're trying to
  133. // insert ABD. Then we'll create a new node s for AB, reduce n to just
  134. // representing C, and insert a new leaf node l to represent d. This
  135. // allows us to ensure that if n was a leaf, it remains a leaf.
  136. //
  137. // | ABC ---split---> | AB
  138. // n s
  139. // C / \ D
  140. // n l
  141. // The node s from the diagram
  142. SuffixTreeNode *SplitNode =
  143. insertInternalNode(Active.Node, NextNode->StartIdx,
  144. NextNode->StartIdx + Active.Len - 1, FirstChar);
  145. // Insert the new node representing the new substring into the tree as
  146. // a child of the split node. This is the node l from the diagram.
  147. insertLeaf(*SplitNode, EndIdx, LastChar);
  148. // Make the old node a child of the split node and update its start
  149. // index. This is the node n from the diagram.
  150. NextNode->StartIdx += Active.Len;
  151. SplitNode->Children[Str[NextNode->StartIdx]] = NextNode;
  152. // SplitNode is an internal node, update the suffix link.
  153. if (NeedsLink)
  154. NeedsLink->Link = SplitNode;
  155. NeedsLink = SplitNode;
  156. }
  157. // We've added something new to the tree, so there's one less suffix to
  158. // add.
  159. SuffixesToAdd--;
  160. if (Active.Node->isRoot()) {
  161. if (Active.Len > 0) {
  162. Active.Len--;
  163. Active.Idx = EndIdx - SuffixesToAdd + 1;
  164. }
  165. } else {
  166. // Start the next phase at the next smallest suffix.
  167. Active.Node = Active.Node->Link;
  168. }
  169. }
  170. return SuffixesToAdd;
  171. }