CommentLexer.cpp 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862
  1. //===--- CommentLexer.cpp -------------------------------------------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. #include "clang/AST/CommentLexer.h"
  9. #include "clang/AST/CommentCommandTraits.h"
  10. #include "clang/AST/CommentDiagnostic.h"
  11. #include "clang/Basic/CharInfo.h"
  12. #include "llvm/ADT/StringExtras.h"
  13. #include "llvm/ADT/StringSwitch.h"
  14. #include "llvm/Support/ConvertUTF.h"
  15. #include "llvm/Support/ErrorHandling.h"
  16. namespace clang {
  17. namespace comments {
  18. void Token::dump(const Lexer &L, const SourceManager &SM) const {
  19. llvm::errs() << "comments::Token Kind=" << Kind << " ";
  20. Loc.print(llvm::errs(), SM);
  21. llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
  22. }
  23. static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
  24. return isLetter(C);
  25. }
  26. static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  27. return isDigit(C);
  28. }
  29. static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
  30. return isHexDigit(C);
  31. }
  32. static inline StringRef convertCodePointToUTF8(
  33. llvm::BumpPtrAllocator &Allocator,
  34. unsigned CodePoint) {
  35. char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
  36. char *ResolvedPtr = Resolved;
  37. if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
  38. return StringRef(Resolved, ResolvedPtr - Resolved);
  39. else
  40. return StringRef();
  41. }
  42. namespace {
  43. #include "clang/AST/CommentHTMLTags.inc"
  44. #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
  45. } // end anonymous namespace
  46. StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
  47. // Fast path, first check a few most widely used named character references.
  48. return llvm::StringSwitch<StringRef>(Name)
  49. .Case("amp", "&")
  50. .Case("lt", "<")
  51. .Case("gt", ">")
  52. .Case("quot", "\"")
  53. .Case("apos", "\'")
  54. // Slow path.
  55. .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
  56. }
  57. StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
  58. unsigned CodePoint = 0;
  59. for (unsigned i = 0, e = Name.size(); i != e; ++i) {
  60. assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
  61. CodePoint *= 10;
  62. CodePoint += Name[i] - '0';
  63. }
  64. return convertCodePointToUTF8(Allocator, CodePoint);
  65. }
  66. StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
  67. unsigned CodePoint = 0;
  68. for (unsigned i = 0, e = Name.size(); i != e; ++i) {
  69. CodePoint *= 16;
  70. const char C = Name[i];
  71. assert(isHTMLHexCharacterReferenceCharacter(C));
  72. CodePoint += llvm::hexDigitValue(C);
  73. }
  74. return convertCodePointToUTF8(Allocator, CodePoint);
  75. }
  76. void Lexer::skipLineStartingDecorations() {
  77. // This function should be called only for C comments
  78. assert(CommentState == LCS_InsideCComment);
  79. if (BufferPtr == CommentEnd)
  80. return;
  81. const char *NewBufferPtr = BufferPtr;
  82. while (isHorizontalWhitespace(*NewBufferPtr))
  83. if (++NewBufferPtr == CommentEnd)
  84. return;
  85. if (*NewBufferPtr == '*')
  86. BufferPtr = NewBufferPtr + 1;
  87. }
  88. namespace {
  89. /// Returns pointer to the first newline character in the string.
  90. const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  91. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  92. if (isVerticalWhitespace(*BufferPtr))
  93. return BufferPtr;
  94. }
  95. return BufferEnd;
  96. }
  97. const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  98. if (BufferPtr == BufferEnd)
  99. return BufferPtr;
  100. if (*BufferPtr == '\n')
  101. BufferPtr++;
  102. else {
  103. assert(*BufferPtr == '\r');
  104. BufferPtr++;
  105. if (BufferPtr != BufferEnd && *BufferPtr == '\n')
  106. BufferPtr++;
  107. }
  108. return BufferPtr;
  109. }
  110. const char *skipNamedCharacterReference(const char *BufferPtr,
  111. const char *BufferEnd) {
  112. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  113. if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
  114. return BufferPtr;
  115. }
  116. return BufferEnd;
  117. }
  118. const char *skipDecimalCharacterReference(const char *BufferPtr,
  119. const char *BufferEnd) {
  120. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  121. if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
  122. return BufferPtr;
  123. }
  124. return BufferEnd;
  125. }
  126. const char *skipHexCharacterReference(const char *BufferPtr,
  127. const char *BufferEnd) {
  128. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  129. if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
  130. return BufferPtr;
  131. }
  132. return BufferEnd;
  133. }
  134. bool isHTMLIdentifierStartingCharacter(char C) {
  135. return isLetter(C);
  136. }
  137. bool isHTMLIdentifierCharacter(char C) {
  138. return isAlphanumeric(C);
  139. }
  140. const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
  141. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  142. if (!isHTMLIdentifierCharacter(*BufferPtr))
  143. return BufferPtr;
  144. }
  145. return BufferEnd;
  146. }
  147. /// Skip HTML string quoted in single or double quotes. Escaping quotes inside
  148. /// string allowed.
  149. ///
  150. /// Returns pointer to closing quote.
  151. const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
  152. {
  153. const char Quote = *BufferPtr;
  154. assert(Quote == '\"' || Quote == '\'');
  155. BufferPtr++;
  156. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  157. const char C = *BufferPtr;
  158. if (C == Quote && BufferPtr[-1] != '\\')
  159. return BufferPtr;
  160. }
  161. return BufferEnd;
  162. }
  163. const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
  164. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  165. if (!isWhitespace(*BufferPtr))
  166. return BufferPtr;
  167. }
  168. return BufferEnd;
  169. }
  170. bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
  171. return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
  172. }
  173. bool isCommandNameStartCharacter(char C) {
  174. return isLetter(C);
  175. }
  176. bool isCommandNameCharacter(char C) {
  177. return isAlphanumeric(C);
  178. }
  179. const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
  180. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  181. if (!isCommandNameCharacter(*BufferPtr))
  182. return BufferPtr;
  183. }
  184. return BufferEnd;
  185. }
  186. /// Return the one past end pointer for BCPL comments.
  187. /// Handles newlines escaped with backslash or trigraph for backslahs.
  188. const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
  189. const char *CurPtr = BufferPtr;
  190. while (CurPtr != BufferEnd) {
  191. while (!isVerticalWhitespace(*CurPtr)) {
  192. CurPtr++;
  193. if (CurPtr == BufferEnd)
  194. return BufferEnd;
  195. }
  196. // We found a newline, check if it is escaped.
  197. const char *EscapePtr = CurPtr - 1;
  198. while(isHorizontalWhitespace(*EscapePtr))
  199. EscapePtr--;
  200. if (*EscapePtr == '\\' ||
  201. (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
  202. EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
  203. // We found an escaped newline.
  204. CurPtr = skipNewline(CurPtr, BufferEnd);
  205. } else
  206. return CurPtr; // Not an escaped newline.
  207. }
  208. return BufferEnd;
  209. }
  210. /// Return the one past end pointer for C comments.
  211. /// Very dumb, does not handle escaped newlines or trigraphs.
  212. const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
  213. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  214. if (*BufferPtr == '*') {
  215. assert(BufferPtr + 1 != BufferEnd);
  216. if (*(BufferPtr + 1) == '/')
  217. return BufferPtr;
  218. }
  219. }
  220. llvm_unreachable("buffer end hit before '*/' was seen");
  221. }
  222. } // end anonymous namespace
  223. void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
  224. tok::TokenKind Kind) {
  225. const unsigned TokLen = TokEnd - BufferPtr;
  226. Result.setLocation(getSourceLocation(BufferPtr));
  227. Result.setKind(Kind);
  228. Result.setLength(TokLen);
  229. #ifndef NDEBUG
  230. Result.TextPtr = "<UNSET>";
  231. Result.IntVal = 7;
  232. #endif
  233. BufferPtr = TokEnd;
  234. }
  235. const char *Lexer::skipTextToken() {
  236. const char *TokenPtr = BufferPtr;
  237. assert(TokenPtr < CommentEnd);
  238. StringRef TokStartSymbols = ParseCommands ? "\n\r\\@\"&<" : "\n\r";
  239. again:
  240. size_t End =
  241. StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(TokStartSymbols);
  242. if (End == StringRef::npos)
  243. return CommentEnd;
  244. // Doxygen doesn't recognize any commands in a one-line double quotation.
  245. // If we don't find an ending quotation mark, we pretend it never began.
  246. if (*(TokenPtr + End) == '\"') {
  247. TokenPtr += End + 1;
  248. End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of("\n\r\"");
  249. if (End != StringRef::npos && *(TokenPtr + End) == '\"')
  250. TokenPtr += End + 1;
  251. goto again;
  252. }
  253. return TokenPtr + End;
  254. }
  255. void Lexer::lexCommentText(Token &T) {
  256. assert(CommentState == LCS_InsideBCPLComment ||
  257. CommentState == LCS_InsideCComment);
  258. // Handles lexing non-command text, i.e. text and newline.
  259. auto HandleNonCommandToken = [&]() -> void {
  260. assert(State == LS_Normal);
  261. const char *TokenPtr = BufferPtr;
  262. assert(TokenPtr < CommentEnd);
  263. switch (*TokenPtr) {
  264. case '\n':
  265. case '\r':
  266. TokenPtr = skipNewline(TokenPtr, CommentEnd);
  267. formTokenWithChars(T, TokenPtr, tok::newline);
  268. if (CommentState == LCS_InsideCComment)
  269. skipLineStartingDecorations();
  270. return;
  271. default:
  272. return formTextToken(T, skipTextToken());
  273. }
  274. };
  275. if (!ParseCommands)
  276. return HandleNonCommandToken();
  277. switch (State) {
  278. case LS_Normal:
  279. break;
  280. case LS_VerbatimBlockFirstLine:
  281. lexVerbatimBlockFirstLine(T);
  282. return;
  283. case LS_VerbatimBlockBody:
  284. lexVerbatimBlockBody(T);
  285. return;
  286. case LS_VerbatimLineText:
  287. lexVerbatimLineText(T);
  288. return;
  289. case LS_HTMLStartTag:
  290. lexHTMLStartTag(T);
  291. return;
  292. case LS_HTMLEndTag:
  293. lexHTMLEndTag(T);
  294. return;
  295. }
  296. assert(State == LS_Normal);
  297. const char *TokenPtr = BufferPtr;
  298. assert(TokenPtr < CommentEnd);
  299. switch(*TokenPtr) {
  300. case '\\':
  301. case '@': {
  302. // Commands that start with a backslash and commands that start with
  303. // 'at' have equivalent semantics. But we keep information about the
  304. // exact syntax in AST for comments.
  305. tok::TokenKind CommandKind =
  306. (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
  307. TokenPtr++;
  308. if (TokenPtr == CommentEnd) {
  309. formTextToken(T, TokenPtr);
  310. return;
  311. }
  312. char C = *TokenPtr;
  313. switch (C) {
  314. default:
  315. break;
  316. case '\\': case '@': case '&': case '$':
  317. case '#': case '<': case '>': case '%':
  318. case '\"': case '.': case ':':
  319. // This is one of \\ \@ \& \$ etc escape sequences.
  320. TokenPtr++;
  321. if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
  322. // This is the \:: escape sequence.
  323. TokenPtr++;
  324. }
  325. StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
  326. formTokenWithChars(T, TokenPtr, tok::text);
  327. T.setText(UnescapedText);
  328. return;
  329. }
  330. // Don't make zero-length commands.
  331. if (!isCommandNameStartCharacter(*TokenPtr)) {
  332. formTextToken(T, TokenPtr);
  333. return;
  334. }
  335. TokenPtr = skipCommandName(TokenPtr, CommentEnd);
  336. unsigned Length = TokenPtr - (BufferPtr + 1);
  337. // Hardcoded support for lexing LaTeX formula commands
  338. // \f$ \f( \f) \f[ \f] \f{ \f} as a single command.
  339. if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
  340. C = *TokenPtr;
  341. if (C == '$' || C == '(' || C == ')' || C == '[' || C == ']' ||
  342. C == '{' || C == '}') {
  343. TokenPtr++;
  344. Length++;
  345. }
  346. }
  347. StringRef CommandName(BufferPtr + 1, Length);
  348. const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
  349. if (!Info) {
  350. if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
  351. StringRef CorrectedName = Info->Name;
  352. SourceLocation Loc = getSourceLocation(BufferPtr);
  353. SourceLocation EndLoc = getSourceLocation(TokenPtr);
  354. SourceRange FullRange = SourceRange(Loc, EndLoc);
  355. SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
  356. Diag(Loc, diag::warn_correct_comment_command_name)
  357. << FullRange << CommandName << CorrectedName
  358. << FixItHint::CreateReplacement(CommandRange, CorrectedName);
  359. } else {
  360. formTokenWithChars(T, TokenPtr, tok::unknown_command);
  361. T.setUnknownCommandName(CommandName);
  362. Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
  363. << SourceRange(T.getLocation(), T.getEndLocation());
  364. return;
  365. }
  366. }
  367. if (Info->IsVerbatimBlockCommand) {
  368. setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
  369. return;
  370. }
  371. if (Info->IsVerbatimLineCommand) {
  372. setupAndLexVerbatimLine(T, TokenPtr, Info);
  373. return;
  374. }
  375. formTokenWithChars(T, TokenPtr, CommandKind);
  376. T.setCommandID(Info->getID());
  377. return;
  378. }
  379. case '&':
  380. lexHTMLCharacterReference(T);
  381. return;
  382. case '<': {
  383. TokenPtr++;
  384. if (TokenPtr == CommentEnd) {
  385. formTextToken(T, TokenPtr);
  386. return;
  387. }
  388. const char C = *TokenPtr;
  389. if (isHTMLIdentifierStartingCharacter(C))
  390. setupAndLexHTMLStartTag(T);
  391. else if (C == '/')
  392. setupAndLexHTMLEndTag(T);
  393. else
  394. formTextToken(T, TokenPtr);
  395. return;
  396. }
  397. default:
  398. return HandleNonCommandToken();
  399. }
  400. }
  401. void Lexer::setupAndLexVerbatimBlock(Token &T,
  402. const char *TextBegin,
  403. char Marker, const CommandInfo *Info) {
  404. assert(Info->IsVerbatimBlockCommand);
  405. VerbatimBlockEndCommandName.clear();
  406. VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
  407. VerbatimBlockEndCommandName.append(Info->EndCommandName);
  408. formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
  409. T.setVerbatimBlockID(Info->getID());
  410. // If there is a newline following the verbatim opening command, skip the
  411. // newline so that we don't create an tok::verbatim_block_line with empty
  412. // text content.
  413. if (BufferPtr != CommentEnd &&
  414. isVerticalWhitespace(*BufferPtr)) {
  415. BufferPtr = skipNewline(BufferPtr, CommentEnd);
  416. State = LS_VerbatimBlockBody;
  417. return;
  418. }
  419. State = LS_VerbatimBlockFirstLine;
  420. }
  421. void Lexer::lexVerbatimBlockFirstLine(Token &T) {
  422. again:
  423. assert(BufferPtr < CommentEnd);
  424. // FIXME: It would be better to scan the text once, finding either the block
  425. // end command or newline.
  426. //
  427. // Extract current line.
  428. const char *Newline = findNewline(BufferPtr, CommentEnd);
  429. StringRef Line(BufferPtr, Newline - BufferPtr);
  430. // Look for end command in current line.
  431. size_t Pos = Line.find(VerbatimBlockEndCommandName);
  432. const char *TextEnd;
  433. const char *NextLine;
  434. if (Pos == StringRef::npos) {
  435. // Current line is completely verbatim.
  436. TextEnd = Newline;
  437. NextLine = skipNewline(Newline, CommentEnd);
  438. } else if (Pos == 0) {
  439. // Current line contains just an end command.
  440. const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
  441. StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
  442. formTokenWithChars(T, End, tok::verbatim_block_end);
  443. T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
  444. State = LS_Normal;
  445. return;
  446. } else {
  447. // There is some text, followed by end command. Extract text first.
  448. TextEnd = BufferPtr + Pos;
  449. NextLine = TextEnd;
  450. // If there is only whitespace before end command, skip whitespace.
  451. if (isWhitespace(BufferPtr, TextEnd)) {
  452. BufferPtr = TextEnd;
  453. goto again;
  454. }
  455. }
  456. StringRef Text(BufferPtr, TextEnd - BufferPtr);
  457. formTokenWithChars(T, NextLine, tok::verbatim_block_line);
  458. T.setVerbatimBlockText(Text);
  459. State = LS_VerbatimBlockBody;
  460. }
  461. void Lexer::lexVerbatimBlockBody(Token &T) {
  462. assert(State == LS_VerbatimBlockBody);
  463. if (CommentState == LCS_InsideCComment)
  464. skipLineStartingDecorations();
  465. if (BufferPtr == CommentEnd) {
  466. formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
  467. T.setVerbatimBlockText("");
  468. return;
  469. }
  470. lexVerbatimBlockFirstLine(T);
  471. }
  472. void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
  473. const CommandInfo *Info) {
  474. assert(Info->IsVerbatimLineCommand);
  475. formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
  476. T.setVerbatimLineID(Info->getID());
  477. State = LS_VerbatimLineText;
  478. }
  479. void Lexer::lexVerbatimLineText(Token &T) {
  480. assert(State == LS_VerbatimLineText);
  481. // Extract current line.
  482. const char *Newline = findNewline(BufferPtr, CommentEnd);
  483. StringRef Text(BufferPtr, Newline - BufferPtr);
  484. formTokenWithChars(T, Newline, tok::verbatim_line_text);
  485. T.setVerbatimLineText(Text);
  486. State = LS_Normal;
  487. }
  488. void Lexer::lexHTMLCharacterReference(Token &T) {
  489. const char *TokenPtr = BufferPtr;
  490. assert(*TokenPtr == '&');
  491. TokenPtr++;
  492. if (TokenPtr == CommentEnd) {
  493. formTextToken(T, TokenPtr);
  494. return;
  495. }
  496. const char *NamePtr;
  497. bool isNamed = false;
  498. bool isDecimal = false;
  499. char C = *TokenPtr;
  500. if (isHTMLNamedCharacterReferenceCharacter(C)) {
  501. NamePtr = TokenPtr;
  502. TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
  503. isNamed = true;
  504. } else if (C == '#') {
  505. TokenPtr++;
  506. if (TokenPtr == CommentEnd) {
  507. formTextToken(T, TokenPtr);
  508. return;
  509. }
  510. C = *TokenPtr;
  511. if (isHTMLDecimalCharacterReferenceCharacter(C)) {
  512. NamePtr = TokenPtr;
  513. TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
  514. isDecimal = true;
  515. } else if (C == 'x' || C == 'X') {
  516. TokenPtr++;
  517. NamePtr = TokenPtr;
  518. TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
  519. } else {
  520. formTextToken(T, TokenPtr);
  521. return;
  522. }
  523. } else {
  524. formTextToken(T, TokenPtr);
  525. return;
  526. }
  527. if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
  528. *TokenPtr != ';') {
  529. formTextToken(T, TokenPtr);
  530. return;
  531. }
  532. StringRef Name(NamePtr, TokenPtr - NamePtr);
  533. TokenPtr++; // Skip semicolon.
  534. StringRef Resolved;
  535. if (isNamed)
  536. Resolved = resolveHTMLNamedCharacterReference(Name);
  537. else if (isDecimal)
  538. Resolved = resolveHTMLDecimalCharacterReference(Name);
  539. else
  540. Resolved = resolveHTMLHexCharacterReference(Name);
  541. if (Resolved.empty()) {
  542. formTextToken(T, TokenPtr);
  543. return;
  544. }
  545. formTokenWithChars(T, TokenPtr, tok::text);
  546. T.setText(Resolved);
  547. }
  548. void Lexer::setupAndLexHTMLStartTag(Token &T) {
  549. assert(BufferPtr[0] == '<' &&
  550. isHTMLIdentifierStartingCharacter(BufferPtr[1]));
  551. const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
  552. StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
  553. if (!isHTMLTagName(Name)) {
  554. formTextToken(T, TagNameEnd);
  555. return;
  556. }
  557. formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
  558. T.setHTMLTagStartName(Name);
  559. BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
  560. const char C = *BufferPtr;
  561. if (BufferPtr != CommentEnd &&
  562. (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
  563. State = LS_HTMLStartTag;
  564. }
  565. void Lexer::lexHTMLStartTag(Token &T) {
  566. assert(State == LS_HTMLStartTag);
  567. const char *TokenPtr = BufferPtr;
  568. char C = *TokenPtr;
  569. if (isHTMLIdentifierCharacter(C)) {
  570. TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
  571. StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
  572. formTokenWithChars(T, TokenPtr, tok::html_ident);
  573. T.setHTMLIdent(Ident);
  574. } else {
  575. switch (C) {
  576. case '=':
  577. TokenPtr++;
  578. formTokenWithChars(T, TokenPtr, tok::html_equals);
  579. break;
  580. case '\"':
  581. case '\'': {
  582. const char *OpenQuote = TokenPtr;
  583. TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
  584. const char *ClosingQuote = TokenPtr;
  585. if (TokenPtr != CommentEnd) // Skip closing quote.
  586. TokenPtr++;
  587. formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
  588. T.setHTMLQuotedString(StringRef(OpenQuote + 1,
  589. ClosingQuote - (OpenQuote + 1)));
  590. break;
  591. }
  592. case '>':
  593. TokenPtr++;
  594. formTokenWithChars(T, TokenPtr, tok::html_greater);
  595. State = LS_Normal;
  596. return;
  597. case '/':
  598. TokenPtr++;
  599. if (TokenPtr != CommentEnd && *TokenPtr == '>') {
  600. TokenPtr++;
  601. formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
  602. } else
  603. formTextToken(T, TokenPtr);
  604. State = LS_Normal;
  605. return;
  606. }
  607. }
  608. // Now look ahead and return to normal state if we don't see any HTML tokens
  609. // ahead.
  610. BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
  611. if (BufferPtr == CommentEnd) {
  612. State = LS_Normal;
  613. return;
  614. }
  615. C = *BufferPtr;
  616. if (!isHTMLIdentifierStartingCharacter(C) &&
  617. C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') {
  618. State = LS_Normal;
  619. return;
  620. }
  621. }
  622. void Lexer::setupAndLexHTMLEndTag(Token &T) {
  623. assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
  624. const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
  625. const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
  626. StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
  627. if (!isHTMLTagName(Name)) {
  628. formTextToken(T, TagNameEnd);
  629. return;
  630. }
  631. const char *End = skipWhitespace(TagNameEnd, CommentEnd);
  632. formTokenWithChars(T, End, tok::html_end_tag);
  633. T.setHTMLTagEndName(Name);
  634. if (BufferPtr != CommentEnd && *BufferPtr == '>')
  635. State = LS_HTMLEndTag;
  636. }
  637. void Lexer::lexHTMLEndTag(Token &T) {
  638. assert(BufferPtr != CommentEnd && *BufferPtr == '>');
  639. formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
  640. State = LS_Normal;
  641. }
  642. Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
  643. const CommandTraits &Traits, SourceLocation FileLoc,
  644. const char *BufferStart, const char *BufferEnd, bool ParseCommands)
  645. : Allocator(Allocator), Diags(Diags), Traits(Traits),
  646. BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart),
  647. FileLoc(FileLoc), ParseCommands(ParseCommands),
  648. CommentState(LCS_BeforeComment), State(LS_Normal) {}
  649. void Lexer::lex(Token &T) {
  650. again:
  651. switch (CommentState) {
  652. case LCS_BeforeComment:
  653. if (BufferPtr == BufferEnd) {
  654. formTokenWithChars(T, BufferPtr, tok::eof);
  655. return;
  656. }
  657. assert(*BufferPtr == '/');
  658. BufferPtr++; // Skip first slash.
  659. switch(*BufferPtr) {
  660. case '/': { // BCPL comment.
  661. BufferPtr++; // Skip second slash.
  662. if (BufferPtr != BufferEnd) {
  663. // Skip Doxygen magic marker, if it is present.
  664. // It might be missing because of a typo //< or /*<, or because we
  665. // merged this non-Doxygen comment into a bunch of Doxygen comments
  666. // around it: /** ... */ /* ... */ /** ... */
  667. const char C = *BufferPtr;
  668. if (C == '/' || C == '!')
  669. BufferPtr++;
  670. }
  671. // Skip less-than symbol that marks trailing comments.
  672. // Skip it even if the comment is not a Doxygen one, because //< and /*<
  673. // are frequent typos.
  674. if (BufferPtr != BufferEnd && *BufferPtr == '<')
  675. BufferPtr++;
  676. CommentState = LCS_InsideBCPLComment;
  677. if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
  678. State = LS_Normal;
  679. CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
  680. goto again;
  681. }
  682. case '*': { // C comment.
  683. BufferPtr++; // Skip star.
  684. // Skip Doxygen magic marker.
  685. const char C = *BufferPtr;
  686. if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
  687. BufferPtr++;
  688. // Skip less-than symbol that marks trailing comments.
  689. if (BufferPtr != BufferEnd && *BufferPtr == '<')
  690. BufferPtr++;
  691. CommentState = LCS_InsideCComment;
  692. State = LS_Normal;
  693. CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
  694. goto again;
  695. }
  696. default:
  697. llvm_unreachable("second character of comment should be '/' or '*'");
  698. }
  699. case LCS_BetweenComments: {
  700. // Consecutive comments are extracted only if there is only whitespace
  701. // between them. So we can search for the start of the next comment.
  702. const char *EndWhitespace = BufferPtr;
  703. while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
  704. EndWhitespace++;
  705. // Turn any whitespace between comments (and there is only whitespace
  706. // between them -- guaranteed by comment extraction) into a newline. We
  707. // have two newlines between C comments in total (first one was synthesized
  708. // after a comment).
  709. formTokenWithChars(T, EndWhitespace, tok::newline);
  710. CommentState = LCS_BeforeComment;
  711. break;
  712. }
  713. case LCS_InsideBCPLComment:
  714. case LCS_InsideCComment:
  715. if (BufferPtr != CommentEnd) {
  716. lexCommentText(T);
  717. break;
  718. } else {
  719. // Skip C comment closing sequence.
  720. if (CommentState == LCS_InsideCComment) {
  721. assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
  722. BufferPtr += 2;
  723. assert(BufferPtr <= BufferEnd);
  724. // Synthenize newline just after the C comment, regardless if there is
  725. // actually a newline.
  726. formTokenWithChars(T, BufferPtr, tok::newline);
  727. CommentState = LCS_BetweenComments;
  728. break;
  729. } else {
  730. // Don't synthesized a newline after BCPL comment.
  731. CommentState = LCS_BetweenComments;
  732. goto again;
  733. }
  734. }
  735. }
  736. }
  737. StringRef Lexer::getSpelling(const Token &Tok,
  738. const SourceManager &SourceMgr) const {
  739. SourceLocation Loc = Tok.getLocation();
  740. std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
  741. bool InvalidTemp = false;
  742. StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
  743. if (InvalidTemp)
  744. return StringRef();
  745. const char *Begin = File.data() + LocInfo.second;
  746. return StringRef(Begin, Tok.getLength());
  747. }
  748. } // end namespace comments
  749. } // end namespace clang