CloneDetection.h 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449
  1. #pragma once
  2. #ifdef __GNUC__
  3. #pragma GCC diagnostic push
  4. #pragma GCC diagnostic ignored "-Wunused-parameter"
  5. #endif
  6. //===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===//
  7. //
  8. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  9. // See https://llvm.org/LICENSE.txt for license information.
  10. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  11. //
  12. //===----------------------------------------------------------------------===//
  13. ///
  14. /// \file
  15. /// This file defines classes for searching and analyzing source code clones.
  16. ///
  17. //===----------------------------------------------------------------------===//
  18. #ifndef LLVM_CLANG_ANALYSIS_CLONEDETECTION_H
  19. #define LLVM_CLANG_ANALYSIS_CLONEDETECTION_H
  20. #include "clang/AST/StmtVisitor.h"
  21. #include "llvm/Support/Regex.h"
  22. #include <vector>
  23. namespace clang {
  24. class Stmt;
  25. class Decl;
  26. class VarDecl;
  27. class ASTContext;
  28. class CompoundStmt;
  29. /// Identifies a list of statements.
  30. ///
  31. /// Can either identify a single arbitrary Stmt object, a continuous sequence of
  32. /// child statements inside a CompoundStmt or no statements at all.
  33. class StmtSequence {
  34. /// If this object identifies a sequence of statements inside a CompoundStmt,
  35. /// S points to this CompoundStmt. If this object only identifies a single
  36. /// Stmt, then S is a pointer to this Stmt.
  37. const Stmt *S;
  38. /// The declaration that contains the statements.
  39. const Decl *D;
  40. /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence
  41. /// instance is representing the CompoundStmt children inside the array
  42. /// [StartIndex, EndIndex).
  43. unsigned StartIndex;
  44. unsigned EndIndex;
  45. public:
  46. /// Constructs a StmtSequence holding multiple statements.
  47. ///
  48. /// The resulting StmtSequence identifies a continuous sequence of statements
  49. /// in the body of the given CompoundStmt. Which statements of the body should
  50. /// be identified needs to be specified by providing a start and end index
  51. /// that describe a non-empty sub-array in the body of the given CompoundStmt.
  52. ///
  53. /// \param Stmt A CompoundStmt that contains all statements in its body.
  54. /// \param D The Decl containing this Stmt.
  55. /// \param StartIndex The inclusive start index in the children array of
  56. /// \p Stmt
  57. /// \param EndIndex The exclusive end index in the children array of \p Stmt.
  58. StmtSequence(const CompoundStmt *Stmt, const Decl *D, unsigned StartIndex,
  59. unsigned EndIndex);
  60. /// Constructs a StmtSequence holding a single statement.
  61. ///
  62. /// \param Stmt An arbitrary Stmt.
  63. /// \param D The Decl containing this Stmt.
  64. StmtSequence(const Stmt *Stmt, const Decl *D);
  65. /// Constructs an empty StmtSequence.
  66. StmtSequence();
  67. typedef const Stmt *const *iterator;
  68. /// Returns an iterator pointing to the first statement in this sequence.
  69. iterator begin() const;
  70. /// Returns an iterator pointing behind the last statement in this sequence.
  71. iterator end() const;
  72. /// Returns the first statement in this sequence.
  73. ///
  74. /// This method should only be called on a non-empty StmtSequence object.
  75. const Stmt *front() const {
  76. assert(!empty());
  77. return begin()[0];
  78. }
  79. /// Returns the last statement in this sequence.
  80. ///
  81. /// This method should only be called on a non-empty StmtSequence object.
  82. const Stmt *back() const {
  83. assert(!empty());
  84. return begin()[size() - 1];
  85. }
  86. /// Returns the number of statements this object holds.
  87. unsigned size() const {
  88. if (holdsSequence())
  89. return EndIndex - StartIndex;
  90. if (S == nullptr)
  91. return 0;
  92. return 1;
  93. }
  94. /// Returns true if and only if this StmtSequence contains no statements.
  95. bool empty() const { return size() == 0; }
  96. /// Returns the related ASTContext for the stored Stmts.
  97. ASTContext &getASTContext() const;
  98. /// Returns the declaration that contains the stored Stmts.
  99. const Decl *getContainingDecl() const {
  100. assert(D);
  101. return D;
  102. }
  103. /// Returns true if this objects holds a list of statements.
  104. bool holdsSequence() const { return EndIndex != 0; }
  105. /// Returns the start sourcelocation of the first statement in this sequence.
  106. ///
  107. /// This method should only be called on a non-empty StmtSequence object.
  108. SourceLocation getBeginLoc() const;
  109. /// Returns the end sourcelocation of the last statement in this sequence.
  110. ///
  111. /// This method should only be called on a non-empty StmtSequence object.
  112. SourceLocation getEndLoc() const;
  113. /// Returns the source range of the whole sequence - from the beginning
  114. /// of the first statement to the end of the last statement.
  115. SourceRange getSourceRange() const;
  116. bool operator==(const StmtSequence &Other) const {
  117. return std::tie(S, StartIndex, EndIndex) ==
  118. std::tie(Other.S, Other.StartIndex, Other.EndIndex);
  119. }
  120. bool operator!=(const StmtSequence &Other) const {
  121. return std::tie(S, StartIndex, EndIndex) !=
  122. std::tie(Other.S, Other.StartIndex, Other.EndIndex);
  123. }
  124. /// Returns true if and only if this sequence covers a source range that
  125. /// contains the source range of the given sequence \p Other.
  126. ///
  127. /// This method should only be called on a non-empty StmtSequence object
  128. /// and passed a non-empty StmtSequence object.
  129. bool contains(const StmtSequence &Other) const;
  130. };
  131. /// Searches for similar subtrees in the AST.
  132. ///
  133. /// First, this class needs several declarations with statement bodies which
  134. /// can be passed via analyzeCodeBody. Afterwards all statements can be
  135. /// searched for clones by calling findClones with a given list of constraints
  136. /// that should specify the wanted properties of the clones.
  137. ///
  138. /// The result of findClones can be further constrained with the constrainClones
  139. /// method.
  140. ///
  141. /// This class only searches for clones in executable source code
  142. /// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
  143. /// are not supported.
  144. class CloneDetector {
  145. public:
  146. /// A collection of StmtSequences that share an arbitrary property.
  147. typedef llvm::SmallVector<StmtSequence, 8> CloneGroup;
  148. /// Generates and stores search data for all statements in the body of
  149. /// the given Decl.
  150. void analyzeCodeBody(const Decl *D);
  151. /// Constrains the given list of clone groups with the given constraint.
  152. ///
  153. /// The constraint is expected to have a method with the signature
  154. /// `void constrain(std::vector<CloneDetector::CloneGroup> &Sequences)`
  155. /// as this is the interface that the CloneDetector uses for applying the
  156. /// constraint. The constraint is supposed to directly modify the passed list
  157. /// so that all clones in the list fulfill the specific property this
  158. /// constraint ensures.
  159. template <typename T>
  160. static void constrainClones(std::vector<CloneGroup> &CloneGroups, T C) {
  161. C.constrain(CloneGroups);
  162. }
  163. /// Constrains the given list of clone groups with the given list of
  164. /// constraints.
  165. ///
  166. /// The constraints are applied in sequence in the order in which they are
  167. /// passed to this function.
  168. template <typename T1, typename... Ts>
  169. static void constrainClones(std::vector<CloneGroup> &CloneGroups, T1 C,
  170. Ts... ConstraintList) {
  171. constrainClones(CloneGroups, C);
  172. constrainClones(CloneGroups, ConstraintList...);
  173. }
  174. /// Searches for clones in all previously passed statements.
  175. /// \param Result Output parameter to which all created clone groups are
  176. /// added.
  177. /// \param ConstraintList The constraints that should be applied to the
  178. // result.
  179. template <typename... Ts>
  180. void findClones(std::vector<CloneGroup> &Result, Ts... ConstraintList) {
  181. // The initial assumption is that there is only one clone group and every
  182. // statement is a clone of the others. This clone group will then be
  183. // split up with the help of the constraints.
  184. Result.push_back(Sequences);
  185. constrainClones(Result, ConstraintList...);
  186. }
  187. private:
  188. CloneGroup Sequences;
  189. };
  190. /// This class is a utility class that contains utility functions for building
  191. /// custom constraints.
  192. class CloneConstraint {
  193. public:
  194. /// Removes all groups by using a filter function.
  195. /// \param CloneGroups The list of CloneGroups that is supposed to be
  196. /// filtered.
  197. /// \param Filter The filter function that should return true for all groups
  198. /// that should be removed from the list.
  199. static void filterGroups(
  200. std::vector<CloneDetector::CloneGroup> &CloneGroups,
  201. llvm::function_ref<bool(const CloneDetector::CloneGroup &)> Filter) {
  202. llvm::erase_if(CloneGroups, Filter);
  203. }
  204. /// Splits the given CloneGroups until the given Compare function returns true
  205. /// for all clones in a single group.
  206. /// \param CloneGroups A list of CloneGroups that should be modified.
  207. /// \param Compare The comparison function that all clones are supposed to
  208. /// pass. Should return true if and only if two clones belong
  209. /// to the same CloneGroup.
  210. static void splitCloneGroups(
  211. std::vector<CloneDetector::CloneGroup> &CloneGroups,
  212. llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)>
  213. Compare);
  214. };
  215. /// This constraint moves clones into clone groups of type II via hashing.
  216. ///
  217. /// Clones with different hash values are moved into separate clone groups.
  218. /// Collisions are possible, and this constraint does nothing to address this
  219. /// them. Add the slower RecursiveCloneTypeIIVerifyConstraint later in the
  220. /// constraint chain, not necessarily immediately, to eliminate hash collisions
  221. /// through a more detailed analysis.
  222. class RecursiveCloneTypeIIHashConstraint {
  223. public:
  224. void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
  225. };
  226. /// This constraint moves clones into clone groups of type II by comparing them.
  227. ///
  228. /// Clones that aren't type II clones are moved into separate clone groups.
  229. /// In contrast to the RecursiveCloneTypeIIHashConstraint, all clones in a clone
  230. /// group are guaranteed to be type II clones of each other, but it is too
  231. /// slow to efficiently handle large amounts of clones.
  232. class RecursiveCloneTypeIIVerifyConstraint {
  233. public:
  234. void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
  235. };
  236. /// Ensures that every clone has at least the given complexity.
  237. ///
  238. /// Complexity is here defined as the total amount of children of a statement.
  239. /// This constraint assumes the first statement in the group is representative
  240. /// for all other statements in the group in terms of complexity.
  241. class MinComplexityConstraint {
  242. unsigned MinComplexity;
  243. public:
  244. MinComplexityConstraint(unsigned MinComplexity)
  245. : MinComplexity(MinComplexity) {}
  246. /// Calculates the complexity of the given StmtSequence.
  247. /// \param Limit The limit of complexity we probe for. After reaching
  248. /// this limit during calculation, this method is exiting
  249. /// early to improve performance and returns this limit.
  250. size_t calculateStmtComplexity(const StmtSequence &Seq, std::size_t Limit,
  251. const std::string &ParentMacroStack = "");
  252. void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
  253. CloneConstraint::filterGroups(
  254. CloneGroups, [this](const CloneDetector::CloneGroup &A) {
  255. if (!A.empty())
  256. return calculateStmtComplexity(A.front(), MinComplexity) <
  257. MinComplexity;
  258. else
  259. return false;
  260. });
  261. }
  262. };
  263. /// Ensures that all clone groups contain at least the given amount of clones.
  264. class MinGroupSizeConstraint {
  265. unsigned MinGroupSize;
  266. public:
  267. MinGroupSizeConstraint(unsigned MinGroupSize = 2)
  268. : MinGroupSize(MinGroupSize) {}
  269. void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
  270. CloneConstraint::filterGroups(CloneGroups,
  271. [this](const CloneDetector::CloneGroup &A) {
  272. return A.size() < MinGroupSize;
  273. });
  274. }
  275. };
  276. /// Ensures that no clone group fully contains another clone group.
  277. struct OnlyLargestCloneConstraint {
  278. void constrain(std::vector<CloneDetector::CloneGroup> &Result);
  279. };
  280. struct FilenamePatternConstraint {
  281. StringRef IgnoredFilesPattern;
  282. std::shared_ptr<llvm::Regex> IgnoredFilesRegex;
  283. FilenamePatternConstraint(StringRef IgnoredFilesPattern)
  284. : IgnoredFilesPattern(IgnoredFilesPattern) {
  285. IgnoredFilesRegex = std::make_shared<llvm::Regex>("^(" +
  286. IgnoredFilesPattern.str() + "$)");
  287. }
  288. bool isAutoGenerated(const CloneDetector::CloneGroup &Group);
  289. void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
  290. CloneConstraint::filterGroups(
  291. CloneGroups, [this](const CloneDetector::CloneGroup &Group) {
  292. return isAutoGenerated(Group);
  293. });
  294. }
  295. };
  296. /// Analyzes the pattern of the referenced variables in a statement.
  297. class VariablePattern {
  298. /// Describes an occurrence of a variable reference in a statement.
  299. struct VariableOccurence {
  300. /// The index of the associated VarDecl in the Variables vector.
  301. size_t KindID;
  302. /// The statement in the code where the variable was referenced.
  303. const Stmt *Mention;
  304. VariableOccurence(size_t KindID, const Stmt *Mention)
  305. : KindID(KindID), Mention(Mention) {}
  306. };
  307. /// All occurrences of referenced variables in the order of appearance.
  308. std::vector<VariableOccurence> Occurences;
  309. /// List of referenced variables in the order of appearance.
  310. /// Every item in this list is unique.
  311. std::vector<const VarDecl *> Variables;
  312. /// Adds a new variable referenced to this pattern.
  313. /// \param VarDecl The declaration of the variable that is referenced.
  314. /// \param Mention The SourceRange where this variable is referenced.
  315. void addVariableOccurence(const VarDecl *VarDecl, const Stmt *Mention);
  316. /// Adds each referenced variable from the given statement.
  317. void addVariables(const Stmt *S);
  318. public:
  319. /// Creates an VariablePattern object with information about the given
  320. /// StmtSequence.
  321. VariablePattern(const StmtSequence &Sequence) {
  322. for (const Stmt *S : Sequence)
  323. addVariables(S);
  324. }
  325. /// Describes two clones that reference their variables in a different pattern
  326. /// which could indicate a programming error.
  327. struct SuspiciousClonePair {
  328. /// Utility class holding the relevant information about a single
  329. /// clone in this pair.
  330. struct SuspiciousCloneInfo {
  331. /// The variable which referencing in this clone was against the pattern.
  332. const VarDecl *Variable;
  333. /// Where the variable was referenced.
  334. const Stmt *Mention;
  335. /// The variable that should have been referenced to follow the pattern.
  336. /// If Suggestion is a nullptr then it's not possible to fix the pattern
  337. /// by referencing a different variable in this clone.
  338. const VarDecl *Suggestion;
  339. SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention,
  340. const VarDecl *Suggestion)
  341. : Variable(Variable), Mention(Mention), Suggestion(Suggestion) {}
  342. SuspiciousCloneInfo() {}
  343. };
  344. /// The first clone in the pair which always has a suggested variable.
  345. SuspiciousCloneInfo FirstCloneInfo;
  346. /// This other clone in the pair which can have a suggested variable.
  347. SuspiciousCloneInfo SecondCloneInfo;
  348. };
  349. /// Counts the differences between this pattern and the given one.
  350. /// \param Other The given VariablePattern to compare with.
  351. /// \param FirstMismatch Output parameter that will be filled with information
  352. /// about the first difference between the two patterns. This parameter
  353. /// can be a nullptr, in which case it will be ignored.
  354. /// \return Returns the number of differences between the pattern this object
  355. /// is following and the given VariablePattern.
  356. ///
  357. /// For example, the following statements all have the same pattern and this
  358. /// function would return zero:
  359. ///
  360. /// if (a < b) return a; return b;
  361. /// if (x < y) return x; return y;
  362. /// if (u2 < u1) return u2; return u1;
  363. ///
  364. /// But the following statement has a different pattern (note the changed
  365. /// variables in the return statements) and would have two differences when
  366. /// compared with one of the statements above.
  367. ///
  368. /// if (a < b) return b; return a;
  369. ///
  370. /// This function should only be called if the related statements of the given
  371. /// pattern and the statements of this objects are clones of each other.
  372. unsigned countPatternDifferences(
  373. const VariablePattern &Other,
  374. VariablePattern::SuspiciousClonePair *FirstMismatch = nullptr);
  375. };
  376. /// Ensures that all clones reference variables in the same pattern.
  377. struct MatchingVariablePatternConstraint {
  378. void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups);
  379. };
  380. } // end namespace clang
  381. #endif // LLVM_CLANG_ANALYSIS_CLONEDETECTION_H
  382. #ifdef __GNUC__
  383. #pragma GCC diagnostic pop
  384. #endif