123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401 |
- //===--- SourceCode.cpp - Source code manipulation routines -----*- C++ -*-===//
- //
- // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- // See https://llvm.org/LICENSE.txt for license information.
- // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- //
- //===----------------------------------------------------------------------===//
- //
- // This file provides functions that simplify extraction of source code.
- //
- //===----------------------------------------------------------------------===//
- #include "clang/Tooling/Transformer/SourceCode.h"
- #include "clang/AST/ASTContext.h"
- #include "clang/AST/Attr.h"
- #include "clang/AST/Comment.h"
- #include "clang/AST/Decl.h"
- #include "clang/AST/DeclCXX.h"
- #include "clang/AST/DeclTemplate.h"
- #include "clang/AST/Expr.h"
- #include "clang/Basic/SourceManager.h"
- #include "clang/Lex/Lexer.h"
- #include "llvm/Support/Errc.h"
- #include "llvm/Support/Error.h"
- #include <set>
- using namespace clang;
- using llvm::errc;
- using llvm::StringError;
- StringRef clang::tooling::getText(CharSourceRange Range,
- const ASTContext &Context) {
- return Lexer::getSourceText(Range, Context.getSourceManager(),
- Context.getLangOpts());
- }
- CharSourceRange clang::tooling::maybeExtendRange(CharSourceRange Range,
- tok::TokenKind Next,
- ASTContext &Context) {
- CharSourceRange R = Lexer::getAsCharRange(Range, Context.getSourceManager(),
- Context.getLangOpts());
- if (R.isInvalid())
- return Range;
- Token Tok;
- bool Err =
- Lexer::getRawToken(R.getEnd(), Tok, Context.getSourceManager(),
- Context.getLangOpts(), /*IgnoreWhiteSpace=*/true);
- if (Err || !Tok.is(Next))
- return Range;
- return CharSourceRange::getTokenRange(Range.getBegin(), Tok.getLocation());
- }
- llvm::Error clang::tooling::validateEditRange(const CharSourceRange &Range,
- const SourceManager &SM) {
- if (Range.isInvalid())
- return llvm::make_error<StringError>(errc::invalid_argument,
- "Invalid range");
- if (Range.getBegin().isMacroID() || Range.getEnd().isMacroID())
- return llvm::make_error<StringError>(
- errc::invalid_argument, "Range starts or ends in a macro expansion");
- if (SM.isInSystemHeader(Range.getBegin()) ||
- SM.isInSystemHeader(Range.getEnd()))
- return llvm::make_error<StringError>(errc::invalid_argument,
- "Range is in system header");
- std::pair<FileID, unsigned> BeginInfo = SM.getDecomposedLoc(Range.getBegin());
- std::pair<FileID, unsigned> EndInfo = SM.getDecomposedLoc(Range.getEnd());
- if (BeginInfo.first != EndInfo.first)
- return llvm::make_error<StringError>(
- errc::invalid_argument, "Range begins and ends in different files");
- if (BeginInfo.second > EndInfo.second)
- return llvm::make_error<StringError>(
- errc::invalid_argument, "Range's begin is past its end");
- return llvm::Error::success();
- }
- llvm::Optional<CharSourceRange>
- clang::tooling::getRangeForEdit(const CharSourceRange &EditRange,
- const SourceManager &SM,
- const LangOptions &LangOpts) {
- // FIXME: makeFileCharRange() has the disadvantage of stripping off "identity"
- // macros. For example, if we're looking to rewrite the int literal 3 to 6,
- // and we have the following definition:
- // #define DO_NOTHING(x) x
- // then
- // foo(DO_NOTHING(3))
- // will be rewritten to
- // foo(6)
- // rather than the arguably better
- // foo(DO_NOTHING(6))
- // Decide whether the current behavior is desirable and modify if not.
- CharSourceRange Range = Lexer::makeFileCharRange(EditRange, SM, LangOpts);
- bool IsInvalid = llvm::errorToBool(validateEditRange(Range, SM));
- if (IsInvalid)
- return llvm::None;
- return Range;
- }
- static bool startsWithNewline(const SourceManager &SM, const Token &Tok) {
- return isVerticalWhitespace(SM.getCharacterData(Tok.getLocation())[0]);
- }
- static bool contains(const std::set<tok::TokenKind> &Terminators,
- const Token &Tok) {
- return Terminators.count(Tok.getKind()) > 0;
- }
- // Returns the exclusive, *file* end location of the entity whose last token is
- // at location 'EntityLast'. That is, it returns the location one past the last
- // relevant character.
- //
- // Associated tokens include comments, horizontal whitespace and 'Terminators'
- // -- optional tokens, which, if any are found, will be included; if
- // 'Terminators' is empty, we will not include any extra tokens beyond comments
- // and horizontal whitespace.
- static SourceLocation
- getEntityEndLoc(const SourceManager &SM, SourceLocation EntityLast,
- const std::set<tok::TokenKind> &Terminators,
- const LangOptions &LangOpts) {
- assert(EntityLast.isValid() && "Invalid end location found.");
- // We remember the last location of a non-horizontal-whitespace token we have
- // lexed; this is the location up to which we will want to delete.
- // FIXME: Support using the spelling loc here for cases where we want to
- // analyze the macro text.
- CharSourceRange ExpansionRange = SM.getExpansionRange(EntityLast);
- // FIXME: Should check isTokenRange(), for the (rare) case that
- // `ExpansionRange` is a character range.
- std::unique_ptr<Lexer> Lexer = [&]() {
- bool Invalid = false;
- auto FileOffset = SM.getDecomposedLoc(ExpansionRange.getEnd());
- llvm::StringRef File = SM.getBufferData(FileOffset.first, &Invalid);
- assert(!Invalid && "Cannot get file/offset");
- return std::make_unique<clang::Lexer>(
- SM.getLocForStartOfFile(FileOffset.first), LangOpts, File.begin(),
- File.data() + FileOffset.second, File.end());
- }();
- // Tell Lexer to return whitespace as pseudo-tokens (kind is tok::unknown).
- Lexer->SetKeepWhitespaceMode(true);
- // Generally, the code we want to include looks like this ([] are optional),
- // If Terminators is empty:
- // [ <comment> ] [ <newline> ]
- // Otherwise:
- // ... <terminator> [ <comment> ] [ <newline> ]
- Token Tok;
- bool Terminated = false;
- // First, lex to the current token (which is the last token of the range that
- // is definitely associated with the decl). Then, we process the first token
- // separately from the rest based on conditions that hold specifically for
- // that first token.
- //
- // We do not search for a terminator if none is required or we've already
- // encountered it. Otherwise, if the original `EntityLast` location was in a
- // macro expansion, we don't have visibility into the text, so we assume we've
- // already terminated. However, we note this assumption with
- // `TerminatedByMacro`, because we'll want to handle it somewhat differently
- // for the terminators semicolon and comma. These terminators can be safely
- // associated with the entity when they appear after the macro -- extra
- // semicolons have no effect on the program and a well-formed program won't
- // have multiple commas in a row, so we're guaranteed that there is only one.
- //
- // FIXME: This handling of macros is more conservative than necessary. When
- // the end of the expansion coincides with the end of the node, we can still
- // safely analyze the code. But, it is more complicated, because we need to
- // start by lexing the spelling loc for the first token and then switch to the
- // expansion loc.
- bool TerminatedByMacro = false;
- Lexer->LexFromRawLexer(Tok);
- if (Terminators.empty() || contains(Terminators, Tok))
- Terminated = true;
- else if (EntityLast.isMacroID()) {
- Terminated = true;
- TerminatedByMacro = true;
- }
- // We save the most recent candidate for the exclusive end location.
- SourceLocation End = Tok.getEndLoc();
- while (!Terminated) {
- // Lex the next token we want to possibly expand the range with.
- Lexer->LexFromRawLexer(Tok);
- switch (Tok.getKind()) {
- case tok::eof:
- // Unexpected separators.
- case tok::l_brace:
- case tok::r_brace:
- case tok::comma:
- return End;
- // Whitespace pseudo-tokens.
- case tok::unknown:
- if (startsWithNewline(SM, Tok))
- // Include at least until the end of the line.
- End = Tok.getEndLoc();
- break;
- default:
- if (contains(Terminators, Tok))
- Terminated = true;
- End = Tok.getEndLoc();
- break;
- }
- }
- do {
- // Lex the next token we want to possibly expand the range with.
- Lexer->LexFromRawLexer(Tok);
- switch (Tok.getKind()) {
- case tok::unknown:
- if (startsWithNewline(SM, Tok))
- // We're done, but include this newline.
- return Tok.getEndLoc();
- break;
- case tok::comment:
- // Include any comments we find on the way.
- End = Tok.getEndLoc();
- break;
- case tok::semi:
- case tok::comma:
- if (TerminatedByMacro && contains(Terminators, Tok)) {
- End = Tok.getEndLoc();
- // We've found a real terminator.
- TerminatedByMacro = false;
- break;
- }
- // Found an unrelated token; stop and don't include it.
- return End;
- default:
- // Found an unrelated token; stop and don't include it.
- return End;
- }
- } while (true);
- }
- // Returns the expected terminator tokens for the given declaration.
- //
- // If we do not know the correct terminator token, returns an empty set.
- //
- // There are cases where we have more than one possible terminator (for example,
- // we find either a comma or a semicolon after a VarDecl).
- static std::set<tok::TokenKind> getTerminators(const Decl &D) {
- if (llvm::isa<RecordDecl>(D) || llvm::isa<UsingDecl>(D))
- return {tok::semi};
- if (llvm::isa<FunctionDecl>(D) || llvm::isa<LinkageSpecDecl>(D))
- return {tok::r_brace, tok::semi};
- if (llvm::isa<VarDecl>(D) || llvm::isa<FieldDecl>(D))
- return {tok::comma, tok::semi};
- return {};
- }
- // Starting from `Loc`, skips whitespace up to, and including, a single
- // newline. Returns the (exclusive) end of any skipped whitespace (that is, the
- // location immediately after the whitespace).
- static SourceLocation skipWhitespaceAndNewline(const SourceManager &SM,
- SourceLocation Loc,
- const LangOptions &LangOpts) {
- const char *LocChars = SM.getCharacterData(Loc);
- int i = 0;
- while (isHorizontalWhitespace(LocChars[i]))
- ++i;
- if (isVerticalWhitespace(LocChars[i]))
- ++i;
- return Loc.getLocWithOffset(i);
- }
- // Is `Loc` separated from any following decl by something meaningful (e.g. an
- // empty line, a comment), ignoring horizontal whitespace? Since this is a
- // heuristic, we return false when in doubt. `Loc` cannot be the first location
- // in the file.
- static bool atOrBeforeSeparation(const SourceManager &SM, SourceLocation Loc,
- const LangOptions &LangOpts) {
- // If the preceding character is a newline, we'll check for an empty line as a
- // separator. However, we can't identify an empty line using tokens, so we
- // analyse the characters. If we try to use tokens, we'll just end up with a
- // whitespace token, whose characters we'd have to analyse anyhow.
- bool Invalid = false;
- const char *LocChars =
- SM.getCharacterData(Loc.getLocWithOffset(-1), &Invalid);
- assert(!Invalid &&
- "Loc must be a valid character and not the first of the source file.");
- if (isVerticalWhitespace(LocChars[0])) {
- for (int i = 1; isWhitespace(LocChars[i]); ++i)
- if (isVerticalWhitespace(LocChars[i]))
- return true;
- }
- // We didn't find an empty line, so lex the next token, skipping past any
- // whitespace we just scanned.
- Token Tok;
- bool Failed = Lexer::getRawToken(Loc, Tok, SM, LangOpts,
- /*IgnoreWhiteSpace=*/true);
- if (Failed)
- // Any text that confuses the lexer seems fair to consider a separation.
- return true;
- switch (Tok.getKind()) {
- case tok::comment:
- case tok::l_brace:
- case tok::r_brace:
- case tok::eof:
- return true;
- default:
- return false;
- }
- }
- CharSourceRange tooling::getAssociatedRange(const Decl &Decl,
- ASTContext &Context) {
- const SourceManager &SM = Context.getSourceManager();
- const LangOptions &LangOpts = Context.getLangOpts();
- CharSourceRange Range = CharSourceRange::getTokenRange(Decl.getSourceRange());
- // First, expand to the start of the template<> declaration if necessary.
- if (const auto *Record = llvm::dyn_cast<CXXRecordDecl>(&Decl)) {
- if (const auto *T = Record->getDescribedClassTemplate())
- if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
- Range.setBegin(T->getBeginLoc());
- } else if (const auto *F = llvm::dyn_cast<FunctionDecl>(&Decl)) {
- if (const auto *T = F->getDescribedFunctionTemplate())
- if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
- Range.setBegin(T->getBeginLoc());
- }
- // Next, expand the end location past trailing comments to include a potential
- // newline at the end of the decl's line.
- Range.setEnd(
- getEntityEndLoc(SM, Decl.getEndLoc(), getTerminators(Decl), LangOpts));
- Range.setTokenRange(false);
- // Expand to include preceeding associated comments. We ignore any comments
- // that are not preceeding the decl, since we've already skipped trailing
- // comments with getEntityEndLoc.
- if (const RawComment *Comment =
- Decl.getASTContext().getRawCommentForDeclNoCache(&Decl))
- // Only include a preceding comment if:
- // * it is *not* separate from the declaration (not including any newline
- // that immediately follows the comment),
- // * the decl *is* separate from any following entity (so, there are no
- // other entities the comment could refer to), and
- // * it is not a IfThisThenThat lint check.
- if (SM.isBeforeInTranslationUnit(Comment->getBeginLoc(),
- Range.getBegin()) &&
- !atOrBeforeSeparation(
- SM, skipWhitespaceAndNewline(SM, Comment->getEndLoc(), LangOpts),
- LangOpts) &&
- atOrBeforeSeparation(SM, Range.getEnd(), LangOpts)) {
- const StringRef CommentText = Comment->getRawText(SM);
- if (!CommentText.contains("LINT.IfChange") &&
- !CommentText.contains("LINT.ThenChange"))
- Range.setBegin(Comment->getBeginLoc());
- }
- // Add leading attributes.
- for (auto *Attr : Decl.attrs()) {
- if (Attr->getLocation().isInvalid() ||
- !SM.isBeforeInTranslationUnit(Attr->getLocation(), Range.getBegin()))
- continue;
- Range.setBegin(Attr->getLocation());
- // Extend to the left '[[' or '__attribute((' if we saw the attribute,
- // unless it is not a valid location.
- bool Invalid;
- StringRef Source =
- SM.getBufferData(SM.getFileID(Range.getBegin()), &Invalid);
- if (Invalid)
- continue;
- llvm::StringRef BeforeAttr =
- Source.substr(0, SM.getFileOffset(Range.getBegin()));
- llvm::StringRef BeforeAttrStripped = BeforeAttr.rtrim();
- for (llvm::StringRef Prefix : {"[[", "__attribute__(("}) {
- // Handle whitespace between attribute prefix and attribute value.
- if (BeforeAttrStripped.endswith(Prefix)) {
- // Move start to start position of prefix, which is
- // length(BeforeAttr) - length(BeforeAttrStripped) + length(Prefix)
- // positions to the left.
- Range.setBegin(Range.getBegin().getLocWithOffset(static_cast<int>(
- -BeforeAttr.size() + BeforeAttrStripped.size() - Prefix.size())));
- break;
- // If we didn't see '[[' or '__attribute' it's probably coming from a
- // macro expansion which is already handled by makeFileCharRange(),
- // below.
- }
- }
- }
- // Range.getEnd() is already fully un-expanded by getEntityEndLoc. But,
- // Range.getBegin() may be inside an expansion.
- return Lexer::makeFileCharRange(Range, SM, LangOpts);
- }
|