CloneDetection.h 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455
  1. #pragma once
  2. #ifdef __GNUC__
  3. #pragma GCC diagnostic push
  4. #pragma GCC diagnostic ignored "-Wunused-parameter"
  5. #endif
  6. //===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===//
  7. //
  8. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  9. // See https://llvm.org/LICENSE.txt for license information.
  10. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  11. //
  12. //===----------------------------------------------------------------------===//
  13. ///
  14. /// \file
  15. /// This file defines classes for searching and analyzing source code clones.
  16. ///
  17. //===----------------------------------------------------------------------===//
  18. #ifndef LLVM_CLANG_ANALYSIS_CLONEDETECTION_H
  19. #define LLVM_CLANG_ANALYSIS_CLONEDETECTION_H
  20. #include "clang/AST/StmtVisitor.h"
  21. #include "llvm/Support/Regex.h"
  22. #include <vector>
  23. namespace clang {
  24. class Stmt;
  25. class Decl;
  26. class VarDecl;
  27. class ASTContext;
  28. class CompoundStmt;
  29. /// Identifies a list of statements.
  30. ///
  31. /// Can either identify a single arbitrary Stmt object, a continuous sequence of
  32. /// child statements inside a CompoundStmt or no statements at all.
  33. class StmtSequence {
  34. /// If this object identifies a sequence of statements inside a CompoundStmt,
  35. /// S points to this CompoundStmt. If this object only identifies a single
  36. /// Stmt, then S is a pointer to this Stmt.
  37. const Stmt *S;
  38. /// The declaration that contains the statements.
  39. const Decl *D;
  40. /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence
  41. /// instance is representing the CompoundStmt children inside the array
  42. /// [StartIndex, EndIndex).
  43. unsigned StartIndex;
  44. unsigned EndIndex;
  45. public:
  46. /// Constructs a StmtSequence holding multiple statements.
  47. ///
  48. /// The resulting StmtSequence identifies a continuous sequence of statements
  49. /// in the body of the given CompoundStmt. Which statements of the body should
  50. /// be identified needs to be specified by providing a start and end index
  51. /// that describe a non-empty sub-array in the body of the given CompoundStmt.
  52. ///
  53. /// \param Stmt A CompoundStmt that contains all statements in its body.
  54. /// \param D The Decl containing this Stmt.
  55. /// \param StartIndex The inclusive start index in the children array of
  56. /// \p Stmt
  57. /// \param EndIndex The exclusive end index in the children array of \p Stmt.
  58. StmtSequence(const CompoundStmt *Stmt, const Decl *D, unsigned StartIndex,
  59. unsigned EndIndex);
  60. /// Constructs a StmtSequence holding a single statement.
  61. ///
  62. /// \param Stmt An arbitrary Stmt.
  63. /// \param D The Decl containing this Stmt.
  64. StmtSequence(const Stmt *Stmt, const Decl *D);
  65. /// Constructs an empty StmtSequence.
  66. StmtSequence();
  67. typedef const Stmt *const *iterator;
  68. /// Returns an iterator pointing to the first statement in this sequence.
  69. iterator begin() const;
  70. /// Returns an iterator pointing behind the last statement in this sequence.
  71. iterator end() const;
  72. /// Returns the first statement in this sequence.
  73. ///
  74. /// This method should only be called on a non-empty StmtSequence object.
  75. const Stmt *front() const {
  76. assert(!empty());
  77. return begin()[0];
  78. }
  79. /// Returns the last statement in this sequence.
  80. ///
  81. /// This method should only be called on a non-empty StmtSequence object.
  82. const Stmt *back() const {
  83. assert(!empty());
  84. return begin()[size() - 1];
  85. }
  86. /// Returns the number of statements this object holds.
  87. unsigned size() const {
  88. if (holdsSequence())
  89. return EndIndex - StartIndex;
  90. if (S == nullptr)
  91. return 0;
  92. return 1;
  93. }
  94. /// Returns true if and only if this StmtSequence contains no statements.
  95. bool empty() const { return size() == 0; }
  96. /// Returns the related ASTContext for the stored Stmts.
  97. ASTContext &getASTContext() const;
  98. /// Returns the declaration that contains the stored Stmts.
  99. const Decl *getContainingDecl() const {
  100. assert(D);
  101. return D;
  102. }
  103. /// Returns true if this objects holds a list of statements.
  104. bool holdsSequence() const { return EndIndex != 0; }
  105. /// Returns the start sourcelocation of the first statement in this sequence.
  106. ///
  107. /// This method should only be called on a non-empty StmtSequence object.
  108. SourceLocation getBeginLoc() const;
  109. /// Returns the end sourcelocation of the last statement in this sequence.
  110. ///
  111. /// This method should only be called on a non-empty StmtSequence object.
  112. SourceLocation getEndLoc() const;
  113. /// Returns the source range of the whole sequence - from the beginning
  114. /// of the first statement to the end of the last statement.
  115. SourceRange getSourceRange() const;
  116. bool operator==(const StmtSequence &Other) const {
  117. return std::tie(S, StartIndex, EndIndex) ==
  118. std::tie(Other.S, Other.StartIndex, Other.EndIndex);
  119. }
  120. bool operator!=(const StmtSequence &Other) const {
  121. return std::tie(S, StartIndex, EndIndex) !=
  122. std::tie(Other.S, Other.StartIndex, Other.EndIndex);
  123. }
  124. /// Returns true if and only if this sequence covers a source range that
  125. /// contains the source range of the given sequence \p Other.
  126. ///
  127. /// This method should only be called on a non-empty StmtSequence object
  128. /// and passed a non-empty StmtSequence object.
  129. bool contains(const StmtSequence &Other) const;
  130. };
  131. /// Searches for similar subtrees in the AST.
  132. ///
  133. /// First, this class needs several declarations with statement bodies which
  134. /// can be passed via analyzeCodeBody. Afterwards all statements can be
  135. /// searched for clones by calling findClones with a given list of constraints
  136. /// that should specify the wanted properties of the clones.
  137. ///
  138. /// The result of findClones can be further constrained with the constrainClones
  139. /// method.
  140. ///
  141. /// This class only searches for clones in executable source code
  142. /// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
  143. /// are not supported.
  144. class CloneDetector {
  145. public:
  146. /// A collection of StmtSequences that share an arbitrary property.
  147. typedef llvm::SmallVector<StmtSequence, 8> CloneGroup;
  148. /// Generates and stores search data for all statements in the body of
  149. /// the given Decl.
  150. void analyzeCodeBody(const Decl *D);
  151. /// Constrains the given list of clone groups with the given constraint.
  152. ///
  153. /// The constraint is expected to have a method with the signature
  154. /// `void constrain(std::vector<CloneDetector::CloneGroup> &Sequences)`
  155. /// as this is the interface that the CloneDetector uses for applying the
  156. /// constraint. The constraint is supposed to directly modify the passed list
  157. /// so that all clones in the list fulfill the specific property this
  158. /// constraint ensures.
  159. template <typename T>
  160. static void constrainClones(std::vector<CloneGroup> &CloneGroups, T C) {
  161. C.constrain(CloneGroups);
  162. }
  163. /// Constrains the given list of clone groups with the given list of
  164. /// constraints.
  165. ///
  166. /// The constraints are applied in sequence in the order in which they are
  167. /// passed to this function.
  168. template <typename T1, typename... Ts>
  169. static void constrainClones(std::vector<CloneGroup> &CloneGroups, T1 C,
  170. Ts... ConstraintList) {
  171. constrainClones(CloneGroups, C);
  172. constrainClones(CloneGroups, ConstraintList...);
  173. }
  174. /// Searches for clones in all previously passed statements.
  175. /// \param Result Output parameter to which all created clone groups are
  176. /// added.
  177. /// \param ConstraintList The constraints that should be applied to the
  178. // result.
  179. template <typename... Ts>
  180. void findClones(std::vector<CloneGroup> &Result, Ts... ConstraintList) {
  181. // The initial assumption is that there is only one clone group and every
  182. // statement is a clone of the others. This clone group will then be
  183. // split up with the help of the constraints.
  184. CloneGroup AllClones;
  185. AllClones.reserve(Sequences.size());
  186. for (const auto &C : Sequences) {
  187. AllClones.push_back(C);
  188. }
  189. Result.push_back(AllClones);
  190. constrainClones(Result, ConstraintList...);
  191. }
  192. private:
  193. CloneGroup Sequences;
  194. };
  195. /// This class is a utility class that contains utility functions for building
  196. /// custom constraints.
  197. class CloneConstraint {
  198. public:
  199. /// Removes all groups by using a filter function.
  200. /// \param CloneGroups The list of CloneGroups that is supposed to be
  201. /// filtered.
  202. /// \param Filter The filter function that should return true for all groups
  203. /// that should be removed from the list.
  204. static void filterGroups(
  205. std::vector<CloneDetector::CloneGroup> &CloneGroups,
  206. llvm::function_ref<bool(const CloneDetector::CloneGroup &)> Filter) {
  207. llvm::erase_if(CloneGroups, Filter);
  208. }
  209. /// Splits the given CloneGroups until the given Compare function returns true
  210. /// for all clones in a single group.
  211. /// \param CloneGroups A list of CloneGroups that should be modified.
  212. /// \param Compare The comparison function that all clones are supposed to
  213. /// pass. Should return true if and only if two clones belong
  214. /// to the same CloneGroup.
  215. static void splitCloneGroups(
  216. std::vector<CloneDetector::CloneGroup> &CloneGroups,
  217. llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)>
  218. Compare);
  219. };
  220. /// This constraint moves clones into clone groups of type II via hashing.
  221. ///
  222. /// Clones with different hash values are moved into separate clone groups.
  223. /// Collisions are possible, and this constraint does nothing to address this
  224. /// them. Add the slower RecursiveCloneTypeIIVerifyConstraint later in the
  225. /// constraint chain, not necessarily immediately, to eliminate hash collisions
  226. /// through a more detailed analysis.
  227. class RecursiveCloneTypeIIHashConstraint {
  228. public:
  229. void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
  230. };
  231. /// This constraint moves clones into clone groups of type II by comparing them.
  232. ///
  233. /// Clones that aren't type II clones are moved into separate clone groups.
  234. /// In contrast to the RecursiveCloneTypeIIHashConstraint, all clones in a clone
  235. /// group are guaranteed to be be type II clones of each other, but it is too
  236. /// slow to efficiently handle large amounts of clones.
  237. class RecursiveCloneTypeIIVerifyConstraint {
  238. public:
  239. void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
  240. };
  241. /// Ensures that every clone has at least the given complexity.
  242. ///
  243. /// Complexity is here defined as the total amount of children of a statement.
  244. /// This constraint assumes the first statement in the group is representative
  245. /// for all other statements in the group in terms of complexity.
  246. class MinComplexityConstraint {
  247. unsigned MinComplexity;
  248. public:
  249. MinComplexityConstraint(unsigned MinComplexity)
  250. : MinComplexity(MinComplexity) {}
  251. /// Calculates the complexity of the given StmtSequence.
  252. /// \param Limit The limit of complexity we probe for. After reaching
  253. /// this limit during calculation, this method is exiting
  254. /// early to improve performance and returns this limit.
  255. size_t calculateStmtComplexity(const StmtSequence &Seq, std::size_t Limit,
  256. const std::string &ParentMacroStack = "");
  257. void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
  258. CloneConstraint::filterGroups(
  259. CloneGroups, [this](const CloneDetector::CloneGroup &A) {
  260. if (!A.empty())
  261. return calculateStmtComplexity(A.front(), MinComplexity) <
  262. MinComplexity;
  263. else
  264. return false;
  265. });
  266. }
  267. };
  268. /// Ensures that all clone groups contain at least the given amount of clones.
  269. class MinGroupSizeConstraint {
  270. unsigned MinGroupSize;
  271. public:
  272. MinGroupSizeConstraint(unsigned MinGroupSize = 2)
  273. : MinGroupSize(MinGroupSize) {}
  274. void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
  275. CloneConstraint::filterGroups(CloneGroups,
  276. [this](const CloneDetector::CloneGroup &A) {
  277. return A.size() < MinGroupSize;
  278. });
  279. }
  280. };
  281. /// Ensures that no clone group fully contains another clone group.
  282. struct OnlyLargestCloneConstraint {
  283. void constrain(std::vector<CloneDetector::CloneGroup> &Result);
  284. };
  285. struct FilenamePatternConstraint {
  286. StringRef IgnoredFilesPattern;
  287. std::shared_ptr<llvm::Regex> IgnoredFilesRegex;
  288. FilenamePatternConstraint(StringRef IgnoredFilesPattern)
  289. : IgnoredFilesPattern(IgnoredFilesPattern) {
  290. IgnoredFilesRegex = std::make_shared<llvm::Regex>("^(" +
  291. IgnoredFilesPattern.str() + "$)");
  292. }
  293. bool isAutoGenerated(const CloneDetector::CloneGroup &Group);
  294. void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
  295. CloneConstraint::filterGroups(
  296. CloneGroups, [this](const CloneDetector::CloneGroup &Group) {
  297. return isAutoGenerated(Group);
  298. });
  299. }
  300. };
  301. /// Analyzes the pattern of the referenced variables in a statement.
  302. class VariablePattern {
  303. /// Describes an occurrence of a variable reference in a statement.
  304. struct VariableOccurence {
  305. /// The index of the associated VarDecl in the Variables vector.
  306. size_t KindID;
  307. /// The statement in the code where the variable was referenced.
  308. const Stmt *Mention;
  309. VariableOccurence(size_t KindID, const Stmt *Mention)
  310. : KindID(KindID), Mention(Mention) {}
  311. };
  312. /// All occurrences of referenced variables in the order of appearance.
  313. std::vector<VariableOccurence> Occurences;
  314. /// List of referenced variables in the order of appearance.
  315. /// Every item in this list is unique.
  316. std::vector<const VarDecl *> Variables;
  317. /// Adds a new variable referenced to this pattern.
  318. /// \param VarDecl The declaration of the variable that is referenced.
  319. /// \param Mention The SourceRange where this variable is referenced.
  320. void addVariableOccurence(const VarDecl *VarDecl, const Stmt *Mention);
  321. /// Adds each referenced variable from the given statement.
  322. void addVariables(const Stmt *S);
  323. public:
  324. /// Creates an VariablePattern object with information about the given
  325. /// StmtSequence.
  326. VariablePattern(const StmtSequence &Sequence) {
  327. for (const Stmt *S : Sequence)
  328. addVariables(S);
  329. }
  330. /// Describes two clones that reference their variables in a different pattern
  331. /// which could indicate a programming error.
  332. struct SuspiciousClonePair {
  333. /// Utility class holding the relevant information about a single
  334. /// clone in this pair.
  335. struct SuspiciousCloneInfo {
  336. /// The variable which referencing in this clone was against the pattern.
  337. const VarDecl *Variable;
  338. /// Where the variable was referenced.
  339. const Stmt *Mention;
  340. /// The variable that should have been referenced to follow the pattern.
  341. /// If Suggestion is a nullptr then it's not possible to fix the pattern
  342. /// by referencing a different variable in this clone.
  343. const VarDecl *Suggestion;
  344. SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention,
  345. const VarDecl *Suggestion)
  346. : Variable(Variable), Mention(Mention), Suggestion(Suggestion) {}
  347. SuspiciousCloneInfo() {}
  348. };
  349. /// The first clone in the pair which always has a suggested variable.
  350. SuspiciousCloneInfo FirstCloneInfo;
  351. /// This other clone in the pair which can have a suggested variable.
  352. SuspiciousCloneInfo SecondCloneInfo;
  353. };
  354. /// Counts the differences between this pattern and the given one.
  355. /// \param Other The given VariablePattern to compare with.
  356. /// \param FirstMismatch Output parameter that will be filled with information
  357. /// about the first difference between the two patterns. This parameter
  358. /// can be a nullptr, in which case it will be ignored.
  359. /// \return Returns the number of differences between the pattern this object
  360. /// is following and the given VariablePattern.
  361. ///
  362. /// For example, the following statements all have the same pattern and this
  363. /// function would return zero:
  364. ///
  365. /// if (a < b) return a; return b;
  366. /// if (x < y) return x; return y;
  367. /// if (u2 < u1) return u2; return u1;
  368. ///
  369. /// But the following statement has a different pattern (note the changed
  370. /// variables in the return statements) and would have two differences when
  371. /// compared with one of the statements above.
  372. ///
  373. /// if (a < b) return b; return a;
  374. ///
  375. /// This function should only be called if the related statements of the given
  376. /// pattern and the statements of this objects are clones of each other.
  377. unsigned countPatternDifferences(
  378. const VariablePattern &Other,
  379. VariablePattern::SuspiciousClonePair *FirstMismatch = nullptr);
  380. };
  381. /// Ensures that all clones reference variables in the same pattern.
  382. struct MatchingVariablePatternConstraint {
  383. void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups);
  384. };
  385. } // end namespace clang
  386. #endif // LLVM_CLANG_ANALYSIS_CLONEDETECTION_H
  387. #ifdef __GNUC__
  388. #pragma GCC diagnostic pop
  389. #endif