//===--- SourceCode.cpp - Source code manipulation routines -----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides functions that simplify extraction of source code.
//
//===----------------------------------------------------------------------===//
#include "clang/Tooling/Transformer/SourceCode.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/Attr.h"
#include "clang/AST/Comment.h"
#include "clang/AST/Decl.h"
#include "clang/AST/DeclCXX.h"
#include "clang/AST/DeclTemplate.h"
#include "clang/AST/Expr.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Lex/Lexer.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include <set>

using namespace clang;

using llvm::errc;
using llvm::StringError;

StringRef clang::tooling::getText(CharSourceRange Range,
                                  const ASTContext &Context) {
  return Lexer::getSourceText(Range, Context.getSourceManager(),
                              Context.getLangOpts());
}
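
// Illustrative usage sketch (not part of the upstream file): given an AST node
// such as an `Expr *E` matched elsewhere, its spelled text can be recovered
// roughly as follows. The names `E` and `Context` are assumed here.
//
//   StringRef Text = clang::tooling::getText(
//       CharSourceRange::getTokenRange(E->getSourceRange()), Context);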

CharSourceRange clang::tooling::maybeExtendRange(CharSourceRange Range,
                                                 tok::TokenKind Next,
                                                 ASTContext &Context) {
  CharSourceRange R = Lexer::getAsCharRange(Range, Context.getSourceManager(),
                                            Context.getLangOpts());
  if (R.isInvalid())
    return Range;
  Token Tok;
  bool Err =
      Lexer::getRawToken(R.getEnd(), Tok, Context.getSourceManager(),
                         Context.getLangOpts(), /*IgnoreWhiteSpace=*/true);
  if (Err || !Tok.is(Next))
    return Range;
  return CharSourceRange::getTokenRange(Range.getBegin(), Tok.getLocation());
}
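
// Illustrative usage sketch (not part of the upstream file): extending the
// range of a statement `S` to also cover a trailing semicolon, if one
// immediately follows. `S` and `Context` are assumed names.
//
//   CharSourceRange R = CharSourceRange::getTokenRange(S->getSourceRange());
//   CharSourceRange WithSemi =
//       clang::tooling::maybeExtendRange(R, tok::semi, Context);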

llvm::Error clang::tooling::validateEditRange(const CharSourceRange &Range,
                                              const SourceManager &SM) {
  if (Range.isInvalid())
    return llvm::make_error<StringError>(errc::invalid_argument,
                                         "Invalid range");

  if (Range.getBegin().isMacroID() || Range.getEnd().isMacroID())
    return llvm::make_error<StringError>(
        errc::invalid_argument, "Range starts or ends in a macro expansion");

  if (SM.isInSystemHeader(Range.getBegin()) ||
      SM.isInSystemHeader(Range.getEnd()))
    return llvm::make_error<StringError>(errc::invalid_argument,
                                         "Range is in system header");

  std::pair<FileID, unsigned> BeginInfo = SM.getDecomposedLoc(Range.getBegin());
  std::pair<FileID, unsigned> EndInfo = SM.getDecomposedLoc(Range.getEnd());
  if (BeginInfo.first != EndInfo.first)
    return llvm::make_error<StringError>(
        errc::invalid_argument, "Range begins and ends in different files");

  if (BeginInfo.second > EndInfo.second)
    return llvm::make_error<StringError>(
        errc::invalid_argument, "Range's begin is past its end");

  return llvm::Error::success();
}
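
// Illustrative usage sketch (not part of the upstream file): rejecting a range
// that cannot be edited before computing a replacement. `R` and `SM` are
// assumed names.
//
//   if (llvm::Error Err = clang::tooling::validateEditRange(R, SM)) {
//     llvm::errs() << llvm::toString(std::move(Err)) << "\n";
//     return;
//   }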

llvm::Optional<CharSourceRange>
clang::tooling::getRangeForEdit(const CharSourceRange &EditRange,
                                const SourceManager &SM,
                                const LangOptions &LangOpts) {
  // FIXME: makeFileCharRange() has the disadvantage of stripping off "identity"
  // macros. For example, if we're looking to rewrite the int literal 3 to 6,
  // and we have the following definition:
  //    #define DO_NOTHING(x) x
  // then
  //    foo(DO_NOTHING(3))
  // will be rewritten to
  //    foo(6)
  // rather than the arguably better
  //    foo(DO_NOTHING(6))
  // Decide whether the current behavior is desirable and modify if not.
  CharSourceRange Range = Lexer::makeFileCharRange(EditRange, SM, LangOpts);

  bool IsInvalid = llvm::errorToBool(validateEditRange(Range, SM));
  if (IsInvalid)
    return llvm::None;
  return Range;
}
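
// Illustrative usage sketch (not part of the upstream file): mapping a range
// that may touch macro expansions to a file range that is safe to rewrite.
// `EditRange`, `SM`, and `LangOpts` are assumed names.
//
//   if (llvm::Optional<CharSourceRange> FileRange =
//           clang::tooling::getRangeForEdit(EditRange, SM, LangOpts)) {
//     // *FileRange lies entirely in one real file and can be handed to, e.g.,
//     // a tooling::Replacement or a Rewriter.
//   }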

static bool startsWithNewline(const SourceManager &SM, const Token &Tok) {
  return isVerticalWhitespace(SM.getCharacterData(Tok.getLocation())[0]);
}

static bool contains(const std::set<tok::TokenKind> &Terminators,
                     const Token &Tok) {
  return Terminators.count(Tok.getKind()) > 0;
}

// Returns the exclusive, *file* end location of the entity whose last token is
// at location 'EntityLast'. That is, it returns the location one past the last
// relevant character.
//
// Associated tokens include comments, horizontal whitespace and 'Terminators'
// -- optional tokens, which, if any are found, will be included; if
// 'Terminators' is empty, we will not include any extra tokens beyond comments
// and horizontal whitespace.
static SourceLocation
getEntityEndLoc(const SourceManager &SM, SourceLocation EntityLast,
                const std::set<tok::TokenKind> &Terminators,
                const LangOptions &LangOpts) {
  assert(EntityLast.isValid() && "Invalid end location found.");

  // We remember the last location of a non-horizontal-whitespace token we have
  // lexed; this is the location up to which we will want to delete.
  // FIXME: Support using the spelling loc here for cases where we want to
  // analyze the macro text.

  CharSourceRange ExpansionRange = SM.getExpansionRange(EntityLast);
  // FIXME: Should check isTokenRange(), for the (rare) case that
  // `ExpansionRange` is a character range.
  std::unique_ptr<Lexer> Lexer = [&]() {
    bool Invalid = false;
    auto FileOffset = SM.getDecomposedLoc(ExpansionRange.getEnd());
    llvm::StringRef File = SM.getBufferData(FileOffset.first, &Invalid);
    assert(!Invalid && "Cannot get file/offset");
    return std::make_unique<clang::Lexer>(
        SM.getLocForStartOfFile(FileOffset.first), LangOpts, File.begin(),
        File.data() + FileOffset.second, File.end());
  }();

  // Tell Lexer to return whitespace as pseudo-tokens (kind is tok::unknown).
  Lexer->SetKeepWhitespaceMode(true);

  // Generally, the code we want to include looks like this ([] are optional),
  // If Terminators is empty:
  //   [ <comment> ] [ <newline> ]
  // Otherwise:
  //   ... <terminator> [ <comment> ] [ <newline> ]

  Token Tok;
  bool Terminated = false;

  // First, lex to the current token (which is the last token of the range that
  // is definitely associated with the decl). Then, we process the first token
  // separately from the rest based on conditions that hold specifically for
  // that first token.
  //
  // We do not search for a terminator if none is required or we've already
  // encountered it. Otherwise, if the original `EntityLast` location was in a
  // macro expansion, we don't have visibility into the text, so we assume we've
  // already terminated. However, we note this assumption with
  // `TerminatedByMacro`, because we'll want to handle it somewhat differently
  // for the terminators semicolon and comma. These terminators can be safely
  // associated with the entity when they appear after the macro -- extra
  // semicolons have no effect on the program and a well-formed program won't
  // have multiple commas in a row, so we're guaranteed that there is only one.
  //
  // FIXME: This handling of macros is more conservative than necessary. When
  // the end of the expansion coincides with the end of the node, we can still
  // safely analyze the code. But, it is more complicated, because we need to
  // start by lexing the spelling loc for the first token and then switch to the
  // expansion loc.
  bool TerminatedByMacro = false;
  Lexer->LexFromRawLexer(Tok);
  if (Terminators.empty() || contains(Terminators, Tok))
    Terminated = true;
  else if (EntityLast.isMacroID()) {
    Terminated = true;
    TerminatedByMacro = true;
  }

  // We save the most recent candidate for the exclusive end location.
  SourceLocation End = Tok.getEndLoc();

  while (!Terminated) {
    // Lex the next token we want to possibly expand the range with.
    Lexer->LexFromRawLexer(Tok);

    switch (Tok.getKind()) {
    case tok::eof:
    // Unexpected separators.
    case tok::l_brace:
    case tok::r_brace:
    case tok::comma:
      return End;
    // Whitespace pseudo-tokens.
    case tok::unknown:
      if (startsWithNewline(SM, Tok))
        // Include at least until the end of the line.
        End = Tok.getEndLoc();
      break;
    default:
      if (contains(Terminators, Tok))
        Terminated = true;
      End = Tok.getEndLoc();
      break;
    }
  }

  do {
    // Lex the next token we want to possibly expand the range with.
    Lexer->LexFromRawLexer(Tok);

    switch (Tok.getKind()) {
    case tok::unknown:
      if (startsWithNewline(SM, Tok))
        // We're done, but include this newline.
        return Tok.getEndLoc();
      break;
    case tok::comment:
      // Include any comments we find on the way.
      End = Tok.getEndLoc();
      break;
    case tok::semi:
    case tok::comma:
      if (TerminatedByMacro && contains(Terminators, Tok)) {
        End = Tok.getEndLoc();
        // We've found a real terminator.
        TerminatedByMacro = false;
        break;
      }
      // Found an unrelated token; stop and don't include it.
      return End;
    default:
      // Found an unrelated token; stop and don't include it.
      return End;
    }
  } while (true);
}

// Returns the expected terminator tokens for the given declaration.
//
// If we do not know the correct terminator token, returns an empty set.
//
// There are cases where we have more than one possible terminator (for example,
// we find either a comma or a semicolon after a VarDecl).
static std::set<tok::TokenKind> getTerminators(const Decl &D) {
  if (llvm::isa<RecordDecl>(D) || llvm::isa<UsingDecl>(D))
    return {tok::semi};

  if (llvm::isa<FunctionDecl>(D) || llvm::isa<LinkageSpecDecl>(D))
    return {tok::r_brace, tok::semi};

  if (llvm::isa<VarDecl>(D) || llvm::isa<FieldDecl>(D))
    return {tok::comma, tok::semi};

  return {};
}

// Starting from `Loc`, skips whitespace up to, and including, a single
// newline. Returns the (exclusive) end of any skipped whitespace (that is, the
// location immediately after the whitespace).
static SourceLocation skipWhitespaceAndNewline(const SourceManager &SM,
                                               SourceLocation Loc,
                                               const LangOptions &LangOpts) {
  const char *LocChars = SM.getCharacterData(Loc);
  int i = 0;
  while (isHorizontalWhitespace(LocChars[i]))
    ++i;
  if (isVerticalWhitespace(LocChars[i]))
    ++i;
  return Loc.getLocWithOffset(i);
}

// Is `Loc` separated from any following decl by something meaningful (e.g. an
// empty line, a comment), ignoring horizontal whitespace? Since this is a
// heuristic, we return false when in doubt. `Loc` cannot be the first location
// in the file.
static bool atOrBeforeSeparation(const SourceManager &SM, SourceLocation Loc,
                                 const LangOptions &LangOpts) {
  // If the preceding character is a newline, we'll check for an empty line as a
  // separator. However, we can't identify an empty line using tokens, so we
  // analyse the characters. If we try to use tokens, we'll just end up with a
  // whitespace token, whose characters we'd have to analyse anyhow.
  bool Invalid = false;
  const char *LocChars =
      SM.getCharacterData(Loc.getLocWithOffset(-1), &Invalid);
  assert(!Invalid &&
         "Loc must be a valid character and not the first of the source file.");
  if (isVerticalWhitespace(LocChars[0])) {
    for (int i = 1; isWhitespace(LocChars[i]); ++i)
      if (isVerticalWhitespace(LocChars[i]))
        return true;
  }
  // We didn't find an empty line, so lex the next token, skipping past any
  // whitespace we just scanned.
  Token Tok;
  bool Failed = Lexer::getRawToken(Loc, Tok, SM, LangOpts,
                                   /*IgnoreWhiteSpace=*/true);
  if (Failed)
    // Any text that confuses the lexer seems fair to consider a separation.
    return true;

  switch (Tok.getKind()) {
  case tok::comment:
  case tok::l_brace:
  case tok::r_brace:
  case tok::eof:
    return true;
  default:
    return false;
  }
}

CharSourceRange tooling::getAssociatedRange(const Decl &Decl,
                                            ASTContext &Context) {
  const SourceManager &SM = Context.getSourceManager();
  const LangOptions &LangOpts = Context.getLangOpts();
  CharSourceRange Range = CharSourceRange::getTokenRange(Decl.getSourceRange());

  // First, expand to the start of the template<> declaration if necessary.
  if (const auto *Record = llvm::dyn_cast<CXXRecordDecl>(&Decl)) {
    if (const auto *T = Record->getDescribedClassTemplate())
      if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
        Range.setBegin(T->getBeginLoc());
  } else if (const auto *F = llvm::dyn_cast<FunctionDecl>(&Decl)) {
    if (const auto *T = F->getDescribedFunctionTemplate())
      if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
        Range.setBegin(T->getBeginLoc());
  }

  // Next, expand the end location past trailing comments to include a
  // potential newline at the end of the decl's line.
  Range.setEnd(
      getEntityEndLoc(SM, Decl.getEndLoc(), getTerminators(Decl), LangOpts));
  Range.setTokenRange(false);

  // Expand to include preceding associated comments. We ignore any comments
  // that are not preceding the decl, since we've already skipped trailing
  // comments with getEntityEndLoc.
  if (const RawComment *Comment =
          Decl.getASTContext().getRawCommentForDeclNoCache(&Decl))
    // Only include a preceding comment if:
    // * it is *not* separate from the declaration (not including any newline
    //   that immediately follows the comment),
    // * the decl *is* separate from any following entity (so, there are no
    //   other entities the comment could refer to), and
    // * it is not an IfThisThenThat lint check.
    if (SM.isBeforeInTranslationUnit(Comment->getBeginLoc(),
                                     Range.getBegin()) &&
        !atOrBeforeSeparation(
            SM, skipWhitespaceAndNewline(SM, Comment->getEndLoc(), LangOpts),
            LangOpts) &&
        atOrBeforeSeparation(SM, Range.getEnd(), LangOpts)) {
      const StringRef CommentText = Comment->getRawText(SM);
      if (!CommentText.contains("LINT.IfChange") &&
          !CommentText.contains("LINT.ThenChange"))
        Range.setBegin(Comment->getBeginLoc());
    }

  // Add leading attributes.
  for (auto *Attr : Decl.attrs()) {
    if (Attr->getLocation().isInvalid() ||
        !SM.isBeforeInTranslationUnit(Attr->getLocation(), Range.getBegin()))
      continue;
    Range.setBegin(Attr->getLocation());

    // Extend to the left '[[' or '__attribute__((' if we saw the attribute,
    // unless it is not a valid location.
    bool Invalid;
    StringRef Source =
        SM.getBufferData(SM.getFileID(Range.getBegin()), &Invalid);
    if (Invalid)
      continue;
    llvm::StringRef BeforeAttr =
        Source.substr(0, SM.getFileOffset(Range.getBegin()));
    llvm::StringRef BeforeAttrStripped = BeforeAttr.rtrim();

    for (llvm::StringRef Prefix : {"[[", "__attribute__(("}) {
      // Handle whitespace between the attribute prefix and the attribute value.
      if (BeforeAttrStripped.endswith(Prefix)) {
        // Move the start to the start position of the prefix, which is
        // length(BeforeAttr) - length(BeforeAttrStripped) + length(Prefix)
        // positions to the left.
        Range.setBegin(Range.getBegin().getLocWithOffset(static_cast<int>(
            -BeforeAttr.size() + BeforeAttrStripped.size() - Prefix.size())));
        break;
      }
      // If we didn't see '[[' or '__attribute__' it's probably coming from a
      // macro expansion which is already handled by makeFileCharRange(),
      // below.
    }
  }

  // Range.getEnd() is already fully un-expanded by getEntityEndLoc. But,
  // Range.getBegin() may be inside an expansion.
  return Lexer::makeFileCharRange(Range, SM, LangOpts);
}
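
// Illustrative usage sketch (not part of the upstream file): removing a
// declaration `D` together with its leading comment, leading attributes, and
// trailing terminator and newline. `D`, `Context`, and `Rewrite` (a
// clang::Rewriter) are assumed names.
//
//   CharSourceRange Associated =
//       clang::tooling::getAssociatedRange(*D, Context);
//   if (Associated.isValid())
//     Rewrite.RemoveText(Associated);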