RawCommentList.cpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448
  1. //===--- RawCommentList.cpp - Processing raw comments -----------*- C++ -*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. #include "clang/AST/RawCommentList.h"
  9. #include "clang/AST/ASTContext.h"
  10. #include "clang/AST/Comment.h"
  11. #include "clang/AST/CommentBriefParser.h"
  12. #include "clang/AST/CommentCommandTraits.h"
  13. #include "clang/AST/CommentLexer.h"
  14. #include "clang/AST/CommentParser.h"
  15. #include "clang/AST/CommentSema.h"
  16. #include "clang/Basic/CharInfo.h"
  17. #include "llvm/ADT/STLExtras.h"
  18. #include "llvm/Support/Allocator.h"
  19. using namespace clang;
  20. namespace {
  21. /// Get comment kind and bool describing if it is a trailing comment.
  22. std::pair<RawComment::CommentKind, bool> getCommentKind(StringRef Comment,
  23. bool ParseAllComments) {
  24. const size_t MinCommentLength = ParseAllComments ? 2 : 3;
  25. if ((Comment.size() < MinCommentLength) || Comment[0] != '/')
  26. return std::make_pair(RawComment::RCK_Invalid, false);
  27. RawComment::CommentKind K;
  28. if (Comment[1] == '/') {
  29. if (Comment.size() < 3)
  30. return std::make_pair(RawComment::RCK_OrdinaryBCPL, false);
  31. if (Comment[2] == '/')
  32. K = RawComment::RCK_BCPLSlash;
  33. else if (Comment[2] == '!')
  34. K = RawComment::RCK_BCPLExcl;
  35. else
  36. return std::make_pair(RawComment::RCK_OrdinaryBCPL, false);
  37. } else {
  38. assert(Comment.size() >= 4);
  39. // Comment lexer does not understand escapes in comment markers, so pretend
  40. // that this is not a comment.
  41. if (Comment[1] != '*' ||
  42. Comment[Comment.size() - 2] != '*' ||
  43. Comment[Comment.size() - 1] != '/')
  44. return std::make_pair(RawComment::RCK_Invalid, false);
  45. if (Comment[2] == '*')
  46. K = RawComment::RCK_JavaDoc;
  47. else if (Comment[2] == '!')
  48. K = RawComment::RCK_Qt;
  49. else
  50. return std::make_pair(RawComment::RCK_OrdinaryC, false);
  51. }
  52. const bool TrailingComment = (Comment.size() > 3) && (Comment[3] == '<');
  53. return std::make_pair(K, TrailingComment);
  54. }
  55. bool mergedCommentIsTrailingComment(StringRef Comment) {
  56. return (Comment.size() > 3) && (Comment[3] == '<');
  57. }
  58. /// Returns true if R1 and R2 both have valid locations that start on the same
  59. /// column.
  60. bool commentsStartOnSameColumn(const SourceManager &SM, const RawComment &R1,
  61. const RawComment &R2) {
  62. SourceLocation L1 = R1.getBeginLoc();
  63. SourceLocation L2 = R2.getBeginLoc();
  64. bool Invalid = false;
  65. unsigned C1 = SM.getPresumedColumnNumber(L1, &Invalid);
  66. if (!Invalid) {
  67. unsigned C2 = SM.getPresumedColumnNumber(L2, &Invalid);
  68. return !Invalid && (C1 == C2);
  69. }
  70. return false;
  71. }
  72. } // unnamed namespace
  73. /// Determines whether there is only whitespace in `Buffer` between `P`
  74. /// and the previous line.
  75. /// \param Buffer The buffer to search in.
  76. /// \param P The offset from the beginning of `Buffer` to start from.
  77. /// \return true if all of the characters in `Buffer` ranging from the closest
  78. /// line-ending character before `P` (or the beginning of `Buffer`) to `P - 1`
  79. /// are whitespace.
  80. static bool onlyWhitespaceOnLineBefore(const char *Buffer, unsigned P) {
  81. // Search backwards until we see linefeed or carriage return.
  82. for (unsigned I = P; I != 0; --I) {
  83. char C = Buffer[I - 1];
  84. if (isVerticalWhitespace(C))
  85. return true;
  86. if (!isHorizontalWhitespace(C))
  87. return false;
  88. }
  89. // We hit the beginning of the buffer.
  90. return true;
  91. }
  92. /// Returns whether `K` is an ordinary comment kind.
  93. static bool isOrdinaryKind(RawComment::CommentKind K) {
  94. return (K == RawComment::RCK_OrdinaryBCPL) ||
  95. (K == RawComment::RCK_OrdinaryC);
  96. }
  97. RawComment::RawComment(const SourceManager &SourceMgr, SourceRange SR,
  98. const CommentOptions &CommentOpts, bool Merged) :
  99. Range(SR), RawTextValid(false), BriefTextValid(false),
  100. IsAttached(false), IsTrailingComment(false),
  101. IsAlmostTrailingComment(false) {
  102. // Extract raw comment text, if possible.
  103. if (SR.getBegin() == SR.getEnd() || getRawText(SourceMgr).empty()) {
  104. Kind = RCK_Invalid;
  105. return;
  106. }
  107. // Guess comment kind.
  108. std::pair<CommentKind, bool> K =
  109. getCommentKind(RawText, CommentOpts.ParseAllComments);
  110. // Guess whether an ordinary comment is trailing.
  111. if (CommentOpts.ParseAllComments && isOrdinaryKind(K.first)) {
  112. FileID BeginFileID;
  113. unsigned BeginOffset;
  114. std::tie(BeginFileID, BeginOffset) =
  115. SourceMgr.getDecomposedLoc(Range.getBegin());
  116. if (BeginOffset != 0) {
  117. bool Invalid = false;
  118. const char *Buffer =
  119. SourceMgr.getBufferData(BeginFileID, &Invalid).data();
  120. IsTrailingComment |=
  121. (!Invalid && !onlyWhitespaceOnLineBefore(Buffer, BeginOffset));
  122. }
  123. }
  124. if (!Merged) {
  125. Kind = K.first;
  126. IsTrailingComment |= K.second;
  127. IsAlmostTrailingComment = RawText.startswith("//<") ||
  128. RawText.startswith("/*<");
  129. } else {
  130. Kind = RCK_Merged;
  131. IsTrailingComment =
  132. IsTrailingComment || mergedCommentIsTrailingComment(RawText);
  133. }
  134. }
  135. StringRef RawComment::getRawTextSlow(const SourceManager &SourceMgr) const {
  136. FileID BeginFileID;
  137. FileID EndFileID;
  138. unsigned BeginOffset;
  139. unsigned EndOffset;
  140. std::tie(BeginFileID, BeginOffset) =
  141. SourceMgr.getDecomposedLoc(Range.getBegin());
  142. std::tie(EndFileID, EndOffset) = SourceMgr.getDecomposedLoc(Range.getEnd());
  143. const unsigned Length = EndOffset - BeginOffset;
  144. if (Length < 2)
  145. return StringRef();
  146. // The comment can't begin in one file and end in another.
  147. assert(BeginFileID == EndFileID);
  148. bool Invalid = false;
  149. const char *BufferStart = SourceMgr.getBufferData(BeginFileID,
  150. &Invalid).data();
  151. if (Invalid)
  152. return StringRef();
  153. return StringRef(BufferStart + BeginOffset, Length);
  154. }
  155. const char *RawComment::extractBriefText(const ASTContext &Context) const {
  156. // Lazily initialize RawText using the accessor before using it.
  157. (void)getRawText(Context.getSourceManager());
  158. // Since we will be copying the resulting text, all allocations made during
  159. // parsing are garbage after resulting string is formed. Thus we can use
  160. // a separate allocator for all temporary stuff.
  161. llvm::BumpPtrAllocator Allocator;
  162. comments::Lexer L(Allocator, Context.getDiagnostics(),
  163. Context.getCommentCommandTraits(),
  164. Range.getBegin(),
  165. RawText.begin(), RawText.end());
  166. comments::BriefParser P(L, Context.getCommentCommandTraits());
  167. const std::string Result = P.Parse();
  168. const unsigned BriefTextLength = Result.size();
  169. char *BriefTextPtr = new (Context) char[BriefTextLength + 1];
  170. memcpy(BriefTextPtr, Result.c_str(), BriefTextLength + 1);
  171. BriefText = BriefTextPtr;
  172. BriefTextValid = true;
  173. return BriefTextPtr;
  174. }
  175. comments::FullComment *RawComment::parse(const ASTContext &Context,
  176. const Preprocessor *PP,
  177. const Decl *D) const {
  178. // Lazily initialize RawText using the accessor before using it.
  179. (void)getRawText(Context.getSourceManager());
  180. comments::Lexer L(Context.getAllocator(), Context.getDiagnostics(),
  181. Context.getCommentCommandTraits(),
  182. getSourceRange().getBegin(),
  183. RawText.begin(), RawText.end());
  184. comments::Sema S(Context.getAllocator(), Context.getSourceManager(),
  185. Context.getDiagnostics(),
  186. Context.getCommentCommandTraits(),
  187. PP);
  188. S.setDecl(D);
  189. comments::Parser P(L, S, Context.getAllocator(), Context.getSourceManager(),
  190. Context.getDiagnostics(),
  191. Context.getCommentCommandTraits());
  192. return P.parseFullComment();
  193. }
  194. static bool onlyWhitespaceBetween(SourceManager &SM,
  195. SourceLocation Loc1, SourceLocation Loc2,
  196. unsigned MaxNewlinesAllowed) {
  197. std::pair<FileID, unsigned> Loc1Info = SM.getDecomposedLoc(Loc1);
  198. std::pair<FileID, unsigned> Loc2Info = SM.getDecomposedLoc(Loc2);
  199. // Question does not make sense if locations are in different files.
  200. if (Loc1Info.first != Loc2Info.first)
  201. return false;
  202. bool Invalid = false;
  203. const char *Buffer = SM.getBufferData(Loc1Info.first, &Invalid).data();
  204. if (Invalid)
  205. return false;
  206. unsigned NumNewlines = 0;
  207. assert(Loc1Info.second <= Loc2Info.second && "Loc1 after Loc2!");
  208. // Look for non-whitespace characters and remember any newlines seen.
  209. for (unsigned I = Loc1Info.second; I != Loc2Info.second; ++I) {
  210. switch (Buffer[I]) {
  211. default:
  212. return false;
  213. case ' ':
  214. case '\t':
  215. case '\f':
  216. case '\v':
  217. break;
  218. case '\r':
  219. case '\n':
  220. ++NumNewlines;
  221. // Check if we have found more than the maximum allowed number of
  222. // newlines.
  223. if (NumNewlines > MaxNewlinesAllowed)
  224. return false;
  225. // Collapse \r\n and \n\r into a single newline.
  226. if (I + 1 != Loc2Info.second &&
  227. (Buffer[I + 1] == '\n' || Buffer[I + 1] == '\r') &&
  228. Buffer[I] != Buffer[I + 1])
  229. ++I;
  230. break;
  231. }
  232. }
  233. return true;
  234. }
  235. void RawCommentList::addComment(const RawComment &RC,
  236. const CommentOptions &CommentOpts,
  237. llvm::BumpPtrAllocator &Allocator) {
  238. if (RC.isInvalid())
  239. return;
  240. // Ordinary comments are not interesting for us.
  241. if (RC.isOrdinary() && !CommentOpts.ParseAllComments)
  242. return;
  243. std::pair<FileID, unsigned> Loc =
  244. SourceMgr.getDecomposedLoc(RC.getBeginLoc());
  245. const FileID CommentFile = Loc.first;
  246. const unsigned CommentOffset = Loc.second;
  247. // If this is the first Doxygen comment, save it (because there isn't
  248. // anything to merge it with).
  249. if (OrderedComments[CommentFile].empty()) {
  250. OrderedComments[CommentFile][CommentOffset] =
  251. new (Allocator) RawComment(RC);
  252. return;
  253. }
  254. const RawComment &C1 = *OrderedComments[CommentFile].rbegin()->second;
  255. const RawComment &C2 = RC;
  256. // Merge comments only if there is only whitespace between them.
  257. // Can't merge trailing and non-trailing comments unless the second is
  258. // non-trailing ordinary in the same column, as in the case:
  259. // int x; // documents x
  260. // // more text
  261. // versus:
  262. // int x; // documents x
  263. // int y; // documents y
  264. // or:
  265. // int x; // documents x
  266. // // documents y
  267. // int y;
  268. // Merge comments if they are on same or consecutive lines.
  269. if ((C1.isTrailingComment() == C2.isTrailingComment() ||
  270. (C1.isTrailingComment() && !C2.isTrailingComment() &&
  271. isOrdinaryKind(C2.getKind()) &&
  272. commentsStartOnSameColumn(SourceMgr, C1, C2))) &&
  273. onlyWhitespaceBetween(SourceMgr, C1.getEndLoc(), C2.getBeginLoc(),
  274. /*MaxNewlinesAllowed=*/1)) {
  275. SourceRange MergedRange(C1.getBeginLoc(), C2.getEndLoc());
  276. *OrderedComments[CommentFile].rbegin()->second =
  277. RawComment(SourceMgr, MergedRange, CommentOpts, true);
  278. } else {
  279. OrderedComments[CommentFile][CommentOffset] =
  280. new (Allocator) RawComment(RC);
  281. }
  282. }
  283. const std::map<unsigned, RawComment *> *
  284. RawCommentList::getCommentsInFile(FileID File) const {
  285. auto CommentsInFile = OrderedComments.find(File);
  286. if (CommentsInFile == OrderedComments.end())
  287. return nullptr;
  288. return &CommentsInFile->second;
  289. }
  290. bool RawCommentList::empty() const { return OrderedComments.empty(); }
  291. unsigned RawCommentList::getCommentBeginLine(RawComment *C, FileID File,
  292. unsigned Offset) const {
  293. auto Cached = CommentBeginLine.find(C);
  294. if (Cached != CommentBeginLine.end())
  295. return Cached->second;
  296. const unsigned Line = SourceMgr.getLineNumber(File, Offset);
  297. CommentBeginLine[C] = Line;
  298. return Line;
  299. }
  300. unsigned RawCommentList::getCommentEndOffset(RawComment *C) const {
  301. auto Cached = CommentEndOffset.find(C);
  302. if (Cached != CommentEndOffset.end())
  303. return Cached->second;
  304. const unsigned Offset =
  305. SourceMgr.getDecomposedLoc(C->getSourceRange().getEnd()).second;
  306. CommentEndOffset[C] = Offset;
  307. return Offset;
  308. }
  309. std::string RawComment::getFormattedText(const SourceManager &SourceMgr,
  310. DiagnosticsEngine &Diags) const {
  311. llvm::StringRef CommentText = getRawText(SourceMgr);
  312. if (CommentText.empty())
  313. return "";
  314. llvm::BumpPtrAllocator Allocator;
  315. // We do not parse any commands, so CommentOptions are ignored by
  316. // comments::Lexer. Therefore, we just use default-constructed options.
  317. CommentOptions DefOpts;
  318. comments::CommandTraits EmptyTraits(Allocator, DefOpts);
  319. comments::Lexer L(Allocator, Diags, EmptyTraits, getSourceRange().getBegin(),
  320. CommentText.begin(), CommentText.end(),
  321. /*ParseCommands=*/false);
  322. std::string Result;
  323. // A column number of the first non-whitespace token in the comment text.
  324. // We skip whitespace up to this column, but keep the whitespace after this
  325. // column. IndentColumn is calculated when lexing the first line and reused
  326. // for the rest of lines.
  327. unsigned IndentColumn = 0;
  328. // Processes one line of the comment and adds it to the result.
  329. // Handles skipping the indent at the start of the line.
  330. // Returns false when eof is reached and true otherwise.
  331. auto LexLine = [&](bool IsFirstLine) -> bool {
  332. comments::Token Tok;
  333. // Lex the first token on the line. We handle it separately, because we to
  334. // fix up its indentation.
  335. L.lex(Tok);
  336. if (Tok.is(comments::tok::eof))
  337. return false;
  338. if (Tok.is(comments::tok::newline)) {
  339. Result += "\n";
  340. return true;
  341. }
  342. llvm::StringRef TokText = L.getSpelling(Tok, SourceMgr);
  343. bool LocInvalid = false;
  344. unsigned TokColumn =
  345. SourceMgr.getSpellingColumnNumber(Tok.getLocation(), &LocInvalid);
  346. assert(!LocInvalid && "getFormattedText for invalid location");
  347. // Amount of leading whitespace in TokText.
  348. size_t WhitespaceLen = TokText.find_first_not_of(" \t");
  349. if (WhitespaceLen == StringRef::npos)
  350. WhitespaceLen = TokText.size();
  351. // Remember the amount of whitespace we skipped in the first line to remove
  352. // indent up to that column in the following lines.
  353. if (IsFirstLine)
  354. IndentColumn = TokColumn + WhitespaceLen;
  355. // Amount of leading whitespace we actually want to skip.
  356. // For the first line we skip all the whitespace.
  357. // For the rest of the lines, we skip whitespace up to IndentColumn.
  358. unsigned SkipLen =
  359. IsFirstLine
  360. ? WhitespaceLen
  361. : std::min<size_t>(
  362. WhitespaceLen,
  363. std::max<int>(static_cast<int>(IndentColumn) - TokColumn, 0));
  364. llvm::StringRef Trimmed = TokText.drop_front(SkipLen);
  365. Result += Trimmed;
  366. // Lex all tokens in the rest of the line.
  367. for (L.lex(Tok); Tok.isNot(comments::tok::eof); L.lex(Tok)) {
  368. if (Tok.is(comments::tok::newline)) {
  369. Result += "\n";
  370. return true;
  371. }
  372. Result += L.getSpelling(Tok, SourceMgr);
  373. }
  374. // We've reached the end of file token.
  375. return false;
  376. };
  377. auto DropTrailingNewLines = [](std::string &Str) {
  378. while (!Str.empty() && Str.back() == '\n')
  379. Str.pop_back();
  380. };
  381. // Process first line separately to remember indent for the following lines.
  382. if (!LexLine(/*IsFirstLine=*/true)) {
  383. DropTrailingNewLines(Result);
  384. return Result;
  385. }
  386. // Process the rest of the lines.
  387. while (LexLine(/*IsFirstLine=*/false))
  388. ;
  389. DropTrailingNewLines(Result);
  390. return Result;
  391. }