123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455 |
- #pragma once
- #ifdef __GNUC__
- #pragma GCC diagnostic push
- #pragma GCC diagnostic ignored "-Wunused-parameter"
- #endif
- //===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===//
- //
- // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- // See https://llvm.org/LICENSE.txt for license information.
- // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- //
- //===----------------------------------------------------------------------===//
- ///
- /// \file
- /// This file defines classes for searching and analyzing source code clones.
- ///
- //===----------------------------------------------------------------------===//
- #ifndef LLVM_CLANG_ANALYSIS_CLONEDETECTION_H
- #define LLVM_CLANG_ANALYSIS_CLONEDETECTION_H
- #include "clang/AST/StmtVisitor.h"
- #include "llvm/Support/Regex.h"
- #include <vector>
- namespace clang {
- class Stmt;
- class Decl;
- class VarDecl;
- class ASTContext;
- class CompoundStmt;
- /// Identifies a list of statements.
- ///
- /// Can either identify a single arbitrary Stmt object, a continuous sequence of
- /// child statements inside a CompoundStmt or no statements at all.
- class StmtSequence {
- /// If this object identifies a sequence of statements inside a CompoundStmt,
- /// S points to this CompoundStmt. If this object only identifies a single
- /// Stmt, then S is a pointer to this Stmt.
- const Stmt *S;
- /// The declaration that contains the statements.
- const Decl *D;
- /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence
- /// instance is representing the CompoundStmt children inside the array
- /// [StartIndex, EndIndex).
- unsigned StartIndex;
- unsigned EndIndex;
- public:
- /// Constructs a StmtSequence holding multiple statements.
- ///
- /// The resulting StmtSequence identifies a continuous sequence of statements
- /// in the body of the given CompoundStmt. Which statements of the body should
- /// be identified needs to be specified by providing a start and end index
- /// that describe a non-empty sub-array in the body of the given CompoundStmt.
- ///
- /// \param Stmt A CompoundStmt that contains all statements in its body.
- /// \param D The Decl containing this Stmt.
- /// \param StartIndex The inclusive start index in the children array of
- /// \p Stmt
- /// \param EndIndex The exclusive end index in the children array of \p Stmt.
- StmtSequence(const CompoundStmt *Stmt, const Decl *D, unsigned StartIndex,
- unsigned EndIndex);
- /// Constructs a StmtSequence holding a single statement.
- ///
- /// \param Stmt An arbitrary Stmt.
- /// \param D The Decl containing this Stmt.
- StmtSequence(const Stmt *Stmt, const Decl *D);
- /// Constructs an empty StmtSequence.
- StmtSequence();
- typedef const Stmt *const *iterator;
- /// Returns an iterator pointing to the first statement in this sequence.
- iterator begin() const;
- /// Returns an iterator pointing behind the last statement in this sequence.
- iterator end() const;
- /// Returns the first statement in this sequence.
- ///
- /// This method should only be called on a non-empty StmtSequence object.
- const Stmt *front() const {
- assert(!empty());
- return begin()[0];
- }
- /// Returns the last statement in this sequence.
- ///
- /// This method should only be called on a non-empty StmtSequence object.
- const Stmt *back() const {
- assert(!empty());
- return begin()[size() - 1];
- }
- /// Returns the number of statements this object holds.
- unsigned size() const {
- if (holdsSequence())
- return EndIndex - StartIndex;
- if (S == nullptr)
- return 0;
- return 1;
- }
- /// Returns true if and only if this StmtSequence contains no statements.
- bool empty() const { return size() == 0; }
- /// Returns the related ASTContext for the stored Stmts.
- ASTContext &getASTContext() const;
- /// Returns the declaration that contains the stored Stmts.
- const Decl *getContainingDecl() const {
- assert(D);
- return D;
- }
- /// Returns true if this objects holds a list of statements.
- bool holdsSequence() const { return EndIndex != 0; }
- /// Returns the start sourcelocation of the first statement in this sequence.
- ///
- /// This method should only be called on a non-empty StmtSequence object.
- SourceLocation getBeginLoc() const;
- /// Returns the end sourcelocation of the last statement in this sequence.
- ///
- /// This method should only be called on a non-empty StmtSequence object.
- SourceLocation getEndLoc() const;
- /// Returns the source range of the whole sequence - from the beginning
- /// of the first statement to the end of the last statement.
- SourceRange getSourceRange() const;
- bool operator==(const StmtSequence &Other) const {
- return std::tie(S, StartIndex, EndIndex) ==
- std::tie(Other.S, Other.StartIndex, Other.EndIndex);
- }
- bool operator!=(const StmtSequence &Other) const {
- return std::tie(S, StartIndex, EndIndex) !=
- std::tie(Other.S, Other.StartIndex, Other.EndIndex);
- }
- /// Returns true if and only if this sequence covers a source range that
- /// contains the source range of the given sequence \p Other.
- ///
- /// This method should only be called on a non-empty StmtSequence object
- /// and passed a non-empty StmtSequence object.
- bool contains(const StmtSequence &Other) const;
- };
- /// Searches for similar subtrees in the AST.
- ///
- /// First, this class needs several declarations with statement bodies which
- /// can be passed via analyzeCodeBody. Afterwards all statements can be
- /// searched for clones by calling findClones with a given list of constraints
- /// that should specify the wanted properties of the clones.
- ///
- /// The result of findClones can be further constrained with the constrainClones
- /// method.
- ///
- /// This class only searches for clones in executable source code
- /// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
- /// are not supported.
- class CloneDetector {
- public:
- /// A collection of StmtSequences that share an arbitrary property.
- typedef llvm::SmallVector<StmtSequence, 8> CloneGroup;
- /// Generates and stores search data for all statements in the body of
- /// the given Decl.
- void analyzeCodeBody(const Decl *D);
- /// Constrains the given list of clone groups with the given constraint.
- ///
- /// The constraint is expected to have a method with the signature
- /// `void constrain(std::vector<CloneDetector::CloneGroup> &Sequences)`
- /// as this is the interface that the CloneDetector uses for applying the
- /// constraint. The constraint is supposed to directly modify the passed list
- /// so that all clones in the list fulfill the specific property this
- /// constraint ensures.
- template <typename T>
- static void constrainClones(std::vector<CloneGroup> &CloneGroups, T C) {
- C.constrain(CloneGroups);
- }
- /// Constrains the given list of clone groups with the given list of
- /// constraints.
- ///
- /// The constraints are applied in sequence in the order in which they are
- /// passed to this function.
- template <typename T1, typename... Ts>
- static void constrainClones(std::vector<CloneGroup> &CloneGroups, T1 C,
- Ts... ConstraintList) {
- constrainClones(CloneGroups, C);
- constrainClones(CloneGroups, ConstraintList...);
- }
- /// Searches for clones in all previously passed statements.
- /// \param Result Output parameter to which all created clone groups are
- /// added.
- /// \param ConstraintList The constraints that should be applied to the
- // result.
- template <typename... Ts>
- void findClones(std::vector<CloneGroup> &Result, Ts... ConstraintList) {
- // The initial assumption is that there is only one clone group and every
- // statement is a clone of the others. This clone group will then be
- // split up with the help of the constraints.
- CloneGroup AllClones;
- AllClones.reserve(Sequences.size());
- for (const auto &C : Sequences) {
- AllClones.push_back(C);
- }
- Result.push_back(AllClones);
- constrainClones(Result, ConstraintList...);
- }
- private:
- CloneGroup Sequences;
- };
- /// This class is a utility class that contains utility functions for building
- /// custom constraints.
- class CloneConstraint {
- public:
- /// Removes all groups by using a filter function.
- /// \param CloneGroups The list of CloneGroups that is supposed to be
- /// filtered.
- /// \param Filter The filter function that should return true for all groups
- /// that should be removed from the list.
- static void filterGroups(
- std::vector<CloneDetector::CloneGroup> &CloneGroups,
- llvm::function_ref<bool(const CloneDetector::CloneGroup &)> Filter) {
- llvm::erase_if(CloneGroups, Filter);
- }
- /// Splits the given CloneGroups until the given Compare function returns true
- /// for all clones in a single group.
- /// \param CloneGroups A list of CloneGroups that should be modified.
- /// \param Compare The comparison function that all clones are supposed to
- /// pass. Should return true if and only if two clones belong
- /// to the same CloneGroup.
- static void splitCloneGroups(
- std::vector<CloneDetector::CloneGroup> &CloneGroups,
- llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)>
- Compare);
- };
- /// This constraint moves clones into clone groups of type II via hashing.
- ///
- /// Clones with different hash values are moved into separate clone groups.
- /// Collisions are possible, and this constraint does nothing to address this
- /// them. Add the slower RecursiveCloneTypeIIVerifyConstraint later in the
- /// constraint chain, not necessarily immediately, to eliminate hash collisions
- /// through a more detailed analysis.
- class RecursiveCloneTypeIIHashConstraint {
- public:
- void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
- };
- /// This constraint moves clones into clone groups of type II by comparing them.
- ///
- /// Clones that aren't type II clones are moved into separate clone groups.
- /// In contrast to the RecursiveCloneTypeIIHashConstraint, all clones in a clone
- /// group are guaranteed to be be type II clones of each other, but it is too
- /// slow to efficiently handle large amounts of clones.
- class RecursiveCloneTypeIIVerifyConstraint {
- public:
- void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
- };
- /// Ensures that every clone has at least the given complexity.
- ///
- /// Complexity is here defined as the total amount of children of a statement.
- /// This constraint assumes the first statement in the group is representative
- /// for all other statements in the group in terms of complexity.
- class MinComplexityConstraint {
- unsigned MinComplexity;
- public:
- MinComplexityConstraint(unsigned MinComplexity)
- : MinComplexity(MinComplexity) {}
- /// Calculates the complexity of the given StmtSequence.
- /// \param Limit The limit of complexity we probe for. After reaching
- /// this limit during calculation, this method is exiting
- /// early to improve performance and returns this limit.
- size_t calculateStmtComplexity(const StmtSequence &Seq, std::size_t Limit,
- const std::string &ParentMacroStack = "");
- void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
- CloneConstraint::filterGroups(
- CloneGroups, [this](const CloneDetector::CloneGroup &A) {
- if (!A.empty())
- return calculateStmtComplexity(A.front(), MinComplexity) <
- MinComplexity;
- else
- return false;
- });
- }
- };
- /// Ensures that all clone groups contain at least the given amount of clones.
- class MinGroupSizeConstraint {
- unsigned MinGroupSize;
- public:
- MinGroupSizeConstraint(unsigned MinGroupSize = 2)
- : MinGroupSize(MinGroupSize) {}
- void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
- CloneConstraint::filterGroups(CloneGroups,
- [this](const CloneDetector::CloneGroup &A) {
- return A.size() < MinGroupSize;
- });
- }
- };
- /// Ensures that no clone group fully contains another clone group.
- struct OnlyLargestCloneConstraint {
- void constrain(std::vector<CloneDetector::CloneGroup> &Result);
- };
- struct FilenamePatternConstraint {
- StringRef IgnoredFilesPattern;
- std::shared_ptr<llvm::Regex> IgnoredFilesRegex;
- FilenamePatternConstraint(StringRef IgnoredFilesPattern)
- : IgnoredFilesPattern(IgnoredFilesPattern) {
- IgnoredFilesRegex = std::make_shared<llvm::Regex>("^(" +
- IgnoredFilesPattern.str() + "$)");
- }
- bool isAutoGenerated(const CloneDetector::CloneGroup &Group);
- void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
- CloneConstraint::filterGroups(
- CloneGroups, [this](const CloneDetector::CloneGroup &Group) {
- return isAutoGenerated(Group);
- });
- }
- };
- /// Analyzes the pattern of the referenced variables in a statement.
- class VariablePattern {
- /// Describes an occurrence of a variable reference in a statement.
- struct VariableOccurence {
- /// The index of the associated VarDecl in the Variables vector.
- size_t KindID;
- /// The statement in the code where the variable was referenced.
- const Stmt *Mention;
- VariableOccurence(size_t KindID, const Stmt *Mention)
- : KindID(KindID), Mention(Mention) {}
- };
- /// All occurrences of referenced variables in the order of appearance.
- std::vector<VariableOccurence> Occurences;
- /// List of referenced variables in the order of appearance.
- /// Every item in this list is unique.
- std::vector<const VarDecl *> Variables;
- /// Adds a new variable referenced to this pattern.
- /// \param VarDecl The declaration of the variable that is referenced.
- /// \param Mention The SourceRange where this variable is referenced.
- void addVariableOccurence(const VarDecl *VarDecl, const Stmt *Mention);
- /// Adds each referenced variable from the given statement.
- void addVariables(const Stmt *S);
- public:
- /// Creates an VariablePattern object with information about the given
- /// StmtSequence.
- VariablePattern(const StmtSequence &Sequence) {
- for (const Stmt *S : Sequence)
- addVariables(S);
- }
- /// Describes two clones that reference their variables in a different pattern
- /// which could indicate a programming error.
- struct SuspiciousClonePair {
- /// Utility class holding the relevant information about a single
- /// clone in this pair.
- struct SuspiciousCloneInfo {
- /// The variable which referencing in this clone was against the pattern.
- const VarDecl *Variable;
- /// Where the variable was referenced.
- const Stmt *Mention;
- /// The variable that should have been referenced to follow the pattern.
- /// If Suggestion is a nullptr then it's not possible to fix the pattern
- /// by referencing a different variable in this clone.
- const VarDecl *Suggestion;
- SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention,
- const VarDecl *Suggestion)
- : Variable(Variable), Mention(Mention), Suggestion(Suggestion) {}
- SuspiciousCloneInfo() {}
- };
- /// The first clone in the pair which always has a suggested variable.
- SuspiciousCloneInfo FirstCloneInfo;
- /// This other clone in the pair which can have a suggested variable.
- SuspiciousCloneInfo SecondCloneInfo;
- };
- /// Counts the differences between this pattern and the given one.
- /// \param Other The given VariablePattern to compare with.
- /// \param FirstMismatch Output parameter that will be filled with information
- /// about the first difference between the two patterns. This parameter
- /// can be a nullptr, in which case it will be ignored.
- /// \return Returns the number of differences between the pattern this object
- /// is following and the given VariablePattern.
- ///
- /// For example, the following statements all have the same pattern and this
- /// function would return zero:
- ///
- /// if (a < b) return a; return b;
- /// if (x < y) return x; return y;
- /// if (u2 < u1) return u2; return u1;
- ///
- /// But the following statement has a different pattern (note the changed
- /// variables in the return statements) and would have two differences when
- /// compared with one of the statements above.
- ///
- /// if (a < b) return b; return a;
- ///
- /// This function should only be called if the related statements of the given
- /// pattern and the statements of this objects are clones of each other.
- unsigned countPatternDifferences(
- const VariablePattern &Other,
- VariablePattern::SuspiciousClonePair *FirstMismatch = nullptr);
- };
- /// Ensures that all clones reference variables in the same pattern.
- struct MatchingVariablePatternConstraint {
- void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups);
- };
- } // end namespace clang
- #endif // LLVM_CLANG_ANALYSIS_CLONEDETECTION_H
- #ifdef __GNUC__
- #pragma GCC diagnostic pop
- #endif
|