FormatTokenLexer.cpp

  1. //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. ///
  9. /// \file
  10. /// This file implements FormatTokenLexer, which tokenizes a source file
  11. /// into a FormatToken stream suitable for ClangFormat.
  12. ///
  13. //===----------------------------------------------------------------------===//
  14. #include "FormatTokenLexer.h"
  15. #include "FormatToken.h"
  16. #include "clang/Basic/SourceLocation.h"
  17. #include "clang/Basic/SourceManager.h"
  18. #include "clang/Format/Format.h"
  19. #include "llvm/Support/Regex.h"
  20. namespace clang {
  21. namespace format {
  22. FormatTokenLexer::FormatTokenLexer(
  23. const SourceManager &SourceMgr, FileID ID, unsigned Column,
  24. const FormatStyle &Style, encoding::Encoding Encoding,
  25. llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
  26. IdentifierTable &IdentTable)
  27. : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
  28. Column(Column), TrailingWhitespace(0),
  29. LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
  30. Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
  31. Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
  32. FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
  33. MacroBlockEndRegex(Style.MacroBlockEnd) {
  34. Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
  35. Lex->SetKeepWhitespaceMode(true);
  36. for (const std::string &ForEachMacro : Style.ForEachMacros) {
  37. auto Identifier = &IdentTable.get(ForEachMacro);
  38. Macros.insert({Identifier, TT_ForEachMacro});
  39. }
  40. for (const std::string &IfMacro : Style.IfMacros) {
  41. auto Identifier = &IdentTable.get(IfMacro);
  42. Macros.insert({Identifier, TT_IfMacro});
  43. }
  44. for (const std::string &AttributeMacro : Style.AttributeMacros) {
  45. auto Identifier = &IdentTable.get(AttributeMacro);
  46. Macros.insert({Identifier, TT_AttributeMacro});
  47. }
  48. for (const std::string &StatementMacro : Style.StatementMacros) {
  49. auto Identifier = &IdentTable.get(StatementMacro);
  50. Macros.insert({Identifier, TT_StatementMacro});
  51. }
  52. for (const std::string &TypenameMacro : Style.TypenameMacros) {
  53. auto Identifier = &IdentTable.get(TypenameMacro);
  54. Macros.insert({Identifier, TT_TypenameMacro});
  55. }
  56. for (const std::string &NamespaceMacro : Style.NamespaceMacros) {
  57. auto Identifier = &IdentTable.get(NamespaceMacro);
  58. Macros.insert({Identifier, TT_NamespaceMacro});
  59. }
  60. for (const std::string &WhitespaceSensitiveMacro :
  61. Style.WhitespaceSensitiveMacros) {
  62. auto Identifier = &IdentTable.get(WhitespaceSensitiveMacro);
  63. Macros.insert({Identifier, TT_UntouchableMacroFunc});
  64. }
  65. for (const std::string &StatementAttributeLikeMacro :
  66. Style.StatementAttributeLikeMacros) {
  67. auto Identifier = &IdentTable.get(StatementAttributeLikeMacro);
  68. Macros.insert({Identifier, TT_StatementAttributeLikeMacro});
  69. }
  70. }
  71. ArrayRef<FormatToken *> FormatTokenLexer::lex() {
  72. assert(Tokens.empty());
  73. assert(FirstInLineIndex == 0);
  74. do {
  75. Tokens.push_back(getNextToken());
  76. if (Style.isJavaScript()) {
  77. tryParseJSRegexLiteral();
  78. handleTemplateStrings();
  79. }
  80. if (Style.Language == FormatStyle::LK_TextProto)
  81. tryParsePythonComment();
  82. tryMergePreviousTokens();
  83. if (Style.isCSharp()) {
  84. // This needs to come after tokens have been merged so that C#
  85. // string literals are correctly identified.
  86. handleCSharpVerbatimAndInterpolatedStrings();
  87. }
  88. if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
  89. FirstInLineIndex = Tokens.size() - 1;
  90. } while (Tokens.back()->isNot(tok::eof));
  91. return Tokens;
  92. }
  93. void FormatTokenLexer::tryMergePreviousTokens() {
  94. if (tryMerge_TMacro())
  95. return;
  96. if (tryMergeConflictMarkers())
  97. return;
  98. if (tryMergeLessLess())
  99. return;
  100. if (tryMergeForEach())
  101. return;
  102. if (Style.isCpp() && tryTransformTryUsageForC())
  103. return;
  104. if (Style.isJavaScript() || Style.isCSharp()) {
  105. static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,
  106. tok::question};
  107. static const tok::TokenKind NullPropagatingOperator[] = {tok::question,
  108. tok::period};
  109. static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};
  110. if (tryMergeTokens(FatArrow, TT_FatArrow))
  111. return;
  112. if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {
  113. // Treat like the "||" operator (as opposed to the ternary ?).
  114. Tokens.back()->Tok.setKind(tok::pipepipe);
  115. return;
  116. }
  117. if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {
  118. // Treat like a regular "." access.
  119. Tokens.back()->Tok.setKind(tok::period);
  120. return;
  121. }
  122. if (tryMergeNullishCoalescingEqual())
  123. return;
  124. }
  125. if (Style.isCSharp()) {
  126. static const tok::TokenKind CSharpNullConditionalLSquare[] = {
  127. tok::question, tok::l_square};
  128. if (tryMergeCSharpKeywordVariables())
  129. return;
  130. if (tryMergeCSharpStringLiteral())
  131. return;
  132. if (tryTransformCSharpForEach())
  133. return;
  134. if (tryMergeTokens(CSharpNullConditionalLSquare,
  135. TT_CSharpNullConditionalLSquare)) {
  136. // Treat like a regular "[" operator.
  137. Tokens.back()->Tok.setKind(tok::l_square);
  138. return;
  139. }
  140. }
  141. if (tryMergeNSStringLiteral())
  142. return;
  143. if (Style.isJavaScript()) {
  144. static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
  145. static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
  146. tok::equal};
  147. static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
  148. tok::greaterequal};
  149. static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
  150. static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
  151. tok::starequal};
  152. static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
  153. static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};
  154. // FIXME: Investigate what token type gives the correct operator priority.
  155. if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
  156. return;
  157. if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
  158. return;
  159. if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
  160. return;
  161. if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
  162. return;
  163. if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
  164. Tokens.back()->Tok.setKind(tok::starequal);
  165. return;
  166. }
  167. if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
  168. tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)) {
  169. // Treat like the "=" assignment operator.
  170. Tokens.back()->Tok.setKind(tok::equal);
  171. return;
  172. }
  173. if (tryMergeJSPrivateIdentifier())
  174. return;
  175. }
  176. if (Style.Language == FormatStyle::LK_Java) {
  177. static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
  178. tok::greater, tok::greater, tok::greaterequal};
  179. if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
  180. return;
  181. }
  182. if (Style.isVerilog()) {
  183. // Merge the number following a base like `'h?a0`.
  184. if (Tokens.size() >= 3 && Tokens.end()[-3]->is(TT_VerilogNumberBase) &&
  185. Tokens.end()[-2]->is(tok::numeric_constant) &&
  186. Tokens.back()->isOneOf(tok::numeric_constant, tok::identifier,
  187. tok::question) &&
  188. tryMergeTokens(2, TT_Unknown)) {
  189. return;
  190. }
  191. // Part select.
  192. if (tryMergeTokensAny({{tok::minus, tok::colon}, {tok::plus, tok::colon}},
  193. TT_BitFieldColon)) {
  194. return;
  195. }
  196. // Xnor. The combined token is treated as a caret which can also be either a
  197. // unary or binary operator. The actual type is determined in
  198. // TokenAnnotator. We also check the token length so we know it is not
  199. // already a merged token.
  200. if (Tokens.back()->TokenText.size() == 1 &&
  201. tryMergeTokensAny({{tok::caret, tok::tilde}, {tok::tilde, tok::caret}},
  202. TT_BinaryOperator)) {
  203. Tokens.back()->Tok.setKind(tok::caret);
  204. return;
  205. }
  206. // Signed shift and distribution weight.
  207. if (tryMergeTokens({tok::less, tok::less}, TT_BinaryOperator)) {
  208. Tokens.back()->Tok.setKind(tok::lessless);
  209. return;
  210. }
  211. if (tryMergeTokens({tok::greater, tok::greater}, TT_BinaryOperator)) {
  212. Tokens.back()->Tok.setKind(tok::greatergreater);
  213. return;
  214. }
  215. if (tryMergeTokensAny({{tok::lessless, tok::equal},
  216. {tok::lessless, tok::lessequal},
  217. {tok::greatergreater, tok::equal},
  218. {tok::greatergreater, tok::greaterequal},
  219. {tok::colon, tok::equal},
  220. {tok::colon, tok::slash}},
  221. TT_BinaryOperator)) {
  222. Tokens.back()->ForcedPrecedence = prec::Assignment;
  223. return;
  224. }
  225. // Exponentiation, signed shift, case equality, and wildcard equality.
  226. if (tryMergeTokensAny({{tok::star, tok::star},
  227. {tok::lessless, tok::less},
  228. {tok::greatergreater, tok::greater},
  229. {tok::exclaimequal, tok::equal},
  230. {tok::exclaimequal, tok::question},
  231. {tok::equalequal, tok::equal},
  232. {tok::equalequal, tok::question}},
  233. TT_BinaryOperator)) {
  234. return;
  235. }
  236. // Module paths in specify blocks and implications in properties.
  237. if (tryMergeTokensAny({{tok::plusequal, tok::greater},
  238. {tok::plus, tok::star, tok::greater},
  239. {tok::minusequal, tok::greater},
  240. {tok::minus, tok::star, tok::greater},
  241. {tok::less, tok::arrow},
  242. {tok::equal, tok::greater},
  243. {tok::star, tok::greater},
  244. {tok::pipeequal, tok::greater},
  245. {tok::pipe, tok::arrow},
  246. {tok::hash, tok::minus, tok::hash},
  247. {tok::hash, tok::equal, tok::hash}},
  248. TT_BinaryOperator)) {
  249. Tokens.back()->ForcedPrecedence = prec::Comma;
  250. return;
  251. }
  252. }
  253. }
  254. bool FormatTokenLexer::tryMergeNSStringLiteral() {
  255. if (Tokens.size() < 2)
  256. return false;
  257. auto &At = *(Tokens.end() - 2);
  258. auto &String = *(Tokens.end() - 1);
  259. if (!At->is(tok::at) || !String->is(tok::string_literal))
  260. return false;
  261. At->Tok.setKind(tok::string_literal);
  262. At->TokenText = StringRef(At->TokenText.begin(),
  263. String->TokenText.end() - At->TokenText.begin());
  264. At->ColumnWidth += String->ColumnWidth;
  265. At->setType(TT_ObjCStringLiteral);
  266. Tokens.erase(Tokens.end() - 1);
  267. return true;
  268. }
  269. bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
  270. // Merges #identifier into a single identifier with the text #identifier
  271. // but the token tok::identifier.
  272. if (Tokens.size() < 2)
  273. return false;
  274. auto &Hash = *(Tokens.end() - 2);
  275. auto &Identifier = *(Tokens.end() - 1);
  276. if (!Hash->is(tok::hash) || !Identifier->is(tok::identifier))
  277. return false;
  278. Hash->Tok.setKind(tok::identifier);
  279. Hash->TokenText =
  280. StringRef(Hash->TokenText.begin(),
  281. Identifier->TokenText.end() - Hash->TokenText.begin());
  282. Hash->ColumnWidth += Identifier->ColumnWidth;
  283. Hash->setType(TT_JsPrivateIdentifier);
  284. Tokens.erase(Tokens.end() - 1);
  285. return true;
  286. }
  287. // Search for verbatim or interpolated string literals @"ABC" or
  288. // $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
  289. // prevent splitting of @, $ and ".
  290. // Merging of multiline verbatim strings with embedded '"' is handled in
  291. // handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
  292. bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
  293. if (Tokens.size() < 2)
  294. return false;
  295. // Look for @"aaaaaa" or $"aaaaaa".
  296. const auto String = *(Tokens.end() - 1);
  297. if (String->isNot(tok::string_literal))
  298. return false;
  299. auto Prefix = *(Tokens.end() - 2);
  300. if (Prefix->isNot(tok::at) && Prefix->TokenText != "$")
  301. return false;
  302. if (Tokens.size() > 2) {
  303. const auto Tok = *(Tokens.end() - 3);
  304. if ((Tok->TokenText == "$" && Prefix->is(tok::at)) ||
  305. (Tok->is(tok::at) && Prefix->TokenText == "$")) {
  306. // This looks like $@"aaa" or @$"aaa" so we need to combine all 3 tokens.
  307. Tok->ColumnWidth += Prefix->ColumnWidth;
  308. Tokens.erase(Tokens.end() - 2);
  309. Prefix = Tok;
  310. }
  311. }
  312. // Convert back into just a string_literal.
  313. Prefix->Tok.setKind(tok::string_literal);
  314. Prefix->TokenText =
  315. StringRef(Prefix->TokenText.begin(),
  316. String->TokenText.end() - Prefix->TokenText.begin());
  317. Prefix->ColumnWidth += String->ColumnWidth;
  318. Prefix->setType(TT_CSharpStringLiteral);
  319. Tokens.erase(Tokens.end() - 1);
  320. return true;
  321. }
  322. // Valid C# attribute targets:
  323. // https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
  324. const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
  325. "assembly", "module", "field", "event", "method",
  326. "param", "property", "return", "type",
  327. };
  328. bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
  329. if (Tokens.size() < 2)
  330. return false;
  331. auto &NullishCoalescing = *(Tokens.end() - 2);
  332. auto &Equal = *(Tokens.end() - 1);
  333. if (NullishCoalescing->getType() != TT_NullCoalescingOperator ||
  334. !Equal->is(tok::equal)) {
  335. return false;
  336. }
  337. NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.
  338. NullishCoalescing->TokenText =
  339. StringRef(NullishCoalescing->TokenText.begin(),
  340. Equal->TokenText.end() - NullishCoalescing->TokenText.begin());
  341. NullishCoalescing->ColumnWidth += Equal->ColumnWidth;
  342. NullishCoalescing->setType(TT_NullCoalescingEqual);
  343. Tokens.erase(Tokens.end() - 1);
  344. return true;
  345. }
  346. bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
  347. if (Tokens.size() < 2)
  348. return false;
  349. const auto At = *(Tokens.end() - 2);
  350. if (At->isNot(tok::at))
  351. return false;
  352. const auto Keyword = *(Tokens.end() - 1);
  353. if (Keyword->TokenText == "$")
  354. return false;
  355. if (!Keywords.isCSharpKeyword(*Keyword))
  356. return false;
  357. At->Tok.setKind(tok::identifier);
  358. At->TokenText = StringRef(At->TokenText.begin(),
  359. Keyword->TokenText.end() - At->TokenText.begin());
  360. At->ColumnWidth += Keyword->ColumnWidth;
  361. At->setType(Keyword->getType());
  362. Tokens.erase(Tokens.end() - 1);
  363. return true;
  364. }
  365. // In C#, transform the identifier foreach into kw_foreach.
  366. bool FormatTokenLexer::tryTransformCSharpForEach() {
  367. if (Tokens.size() < 1)
  368. return false;
  369. auto &Identifier = *(Tokens.end() - 1);
  370. if (!Identifier->is(tok::identifier))
  371. return false;
  372. if (Identifier->TokenText != "foreach")
  373. return false;
  374. Identifier->setType(TT_ForEachMacro);
  375. Identifier->Tok.setKind(tok::kw_for);
  376. return true;
  377. }
  378. bool FormatTokenLexer::tryMergeForEach() {
  379. if (Tokens.size() < 2)
  380. return false;
  381. auto &For = *(Tokens.end() - 2);
  382. auto &Each = *(Tokens.end() - 1);
  383. if (!For->is(tok::kw_for))
  384. return false;
  385. if (!Each->is(tok::identifier))
  386. return false;
  387. if (Each->TokenText != "each")
  388. return false;
  389. For->setType(TT_ForEachMacro);
  390. For->Tok.setKind(tok::kw_for);
  391. For->TokenText = StringRef(For->TokenText.begin(),
  392. Each->TokenText.end() - For->TokenText.begin());
  393. For->ColumnWidth += Each->ColumnWidth;
  394. Tokens.erase(Tokens.end() - 1);
  395. return true;
  396. }
  397. bool FormatTokenLexer::tryTransformTryUsageForC() {
  398. if (Tokens.size() < 2)
  399. return false;
  400. auto &Try = *(Tokens.end() - 2);
  401. if (!Try->is(tok::kw_try))
  402. return false;
  403. auto &Next = *(Tokens.end() - 1);
  404. if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
  405. return false;
  406. if (Tokens.size() > 2) {
  407. auto &At = *(Tokens.end() - 3);
  408. if (At->is(tok::at))
  409. return false;
  410. }
  411. Try->Tok.setKind(tok::identifier);
  412. return true;
  413. }
  414. bool FormatTokenLexer::tryMergeLessLess() {
  415. // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
  416. if (Tokens.size() < 3)
  417. return false;
  418. auto First = Tokens.end() - 3;
  419. if (First[0]->isNot(tok::less) || First[1]->isNot(tok::less))
  420. return false;
  421. // Only merge if there currently is no whitespace between the two "<".
  422. if (First[1]->hasWhitespaceBefore())
  423. return false;
  424. auto X = Tokens.size() > 3 ? First[-1] : nullptr;
  425. auto Y = First[2];
  426. if ((X && X->is(tok::less)) || Y->is(tok::less))
  427. return false;
  428. // Do not remove the whitespace between the two "<", e.g. in "operator< <>".
  429. if (X && X->is(tok::kw_operator) && Y->is(tok::greater))
  430. return false;
  431. First[0]->Tok.setKind(tok::lessless);
  432. First[0]->TokenText = "<<";
  433. First[0]->ColumnWidth += 1;
  434. Tokens.erase(Tokens.end() - 2);
  435. return true;
  436. }
  437. bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
  438. TokenType NewType) {
  439. if (Tokens.size() < Kinds.size())
  440. return false;
  441. SmallVectorImpl<FormatToken *>::const_iterator First =
  442. Tokens.end() - Kinds.size();
  443. for (unsigned i = 0; i < Kinds.size(); ++i)
  444. if (!First[i]->is(Kinds[i]))
  445. return false;
  446. return tryMergeTokens(Kinds.size(), NewType);
  447. }
  448. bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) {
  449. if (Tokens.size() < Count)
  450. return false;
  451. SmallVectorImpl<FormatToken *>::const_iterator First = Tokens.end() - Count;
  452. unsigned AddLength = 0;
  453. for (size_t i = 1; i < Count; ++i) {
  454. // If there is whitespace separating the token and the previous one,
  455. // they should not be merged.
  456. if (First[i]->hasWhitespaceBefore())
  457. return false;
  458. AddLength += First[i]->TokenText.size();
  459. }
  460. Tokens.resize(Tokens.size() - Count + 1);
  461. First[0]->TokenText = StringRef(First[0]->TokenText.data(),
  462. First[0]->TokenText.size() + AddLength);
  463. First[0]->ColumnWidth += AddLength;
  464. First[0]->setType(NewType);
  465. return true;
  466. }
  467. bool FormatTokenLexer::tryMergeTokensAny(
  468. ArrayRef<ArrayRef<tok::TokenKind>> Kinds, TokenType NewType) {
  469. return llvm::any_of(Kinds, [this, NewType](ArrayRef<tok::TokenKind> Kinds) {
  470. return tryMergeTokens(Kinds, NewType);
  471. });
  472. }
  473. // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
  474. bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
  475. // NB: This is not entirely correct, as an r_paren can introduce an operand
  476. // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
  477. // corner case to not matter in practice, though.
  478. return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
  479. tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
  480. tok::colon, tok::question, tok::tilde) ||
  481. Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
  482. tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
  483. tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
  484. Tok->isBinaryOperator();
  485. }
  486. bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
  487. if (!Prev)
  488. return true;
  489. // Regex literals can only follow after prefix unary operators, not after
  490. // postfix unary operators. If the '++' is followed by a non-operand
  491. // introducing token, the slash here is the operand and not the start of a
  492. // regex.
  493. // `!` is a unary prefix operator, but also a postfix operator that casts
  494. // away nullability, so the same check applies.
  495. if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
  496. return Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]);
  497. // The previous token must introduce an operand location where regex
  498. // literals can occur.
  499. if (!precedesOperand(Prev))
  500. return false;
  501. return true;
  502. }
  503. // Tries to parse a JavaScript Regex literal starting at the current token,
  504. // if that begins with a slash and is in a location where JavaScript allows
  505. // regex literals. Changes the current token to a regex literal and updates
  506. // its text if successful.
  507. void FormatTokenLexer::tryParseJSRegexLiteral() {
  508. FormatToken *RegexToken = Tokens.back();
  509. if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
  510. return;
  511. FormatToken *Prev = nullptr;
  512. for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) {
  513. // NB: Because previous pointers are not initialized yet, this cannot use
  514. // Token.getPreviousNonComment.
  515. if (FT->isNot(tok::comment)) {
  516. Prev = FT;
  517. break;
  518. }
  519. }
  520. if (!canPrecedeRegexLiteral(Prev))
  521. return;
  522. // 'Manually' lex ahead in the current file buffer.
  523. const char *Offset = Lex->getBufferLocation();
  524. const char *RegexBegin = Offset - RegexToken->TokenText.size();
  525. StringRef Buffer = Lex->getBuffer();
  526. bool InCharacterClass = false;
  527. bool HaveClosingSlash = false;
  528. for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
  529. // Regular expressions are terminated with a '/', which can only be
  530. // escaped using '\' or placed inside a character class between '[' and ']'.
  531. // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
  532. switch (*Offset) {
  533. case '\\':
  534. // Skip the escaped character.
  535. ++Offset;
  536. break;
  537. case '[':
  538. InCharacterClass = true;
  539. break;
  540. case ']':
  541. InCharacterClass = false;
  542. break;
  543. case '/':
  544. if (!InCharacterClass)
  545. HaveClosingSlash = true;
  546. break;
  547. }
  548. }
  549. RegexToken->setType(TT_RegexLiteral);
  550. // Treat regex literals like other string_literals.
  551. RegexToken->Tok.setKind(tok::string_literal);
  552. RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
  553. RegexToken->ColumnWidth = RegexToken->TokenText.size();
  554. resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
  555. }
  556. static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim,
  557. bool Interpolated) {
  558. auto Repeated = [&Begin, End]() {
  559. return Begin + 1 < End && Begin[1] == Begin[0];
  560. };
  561. // Look for a terminating '"' in the current file buffer.
  562. // Make no effort to format code within an interpolated or verbatim string.
  563. //
  564. // Interpolated strings could contain { } with " characters inside.
  565. // $"{x ?? "null"}"
  566. // should not be split into $"{x ?? ", null, "}" but should be treated as a
  567. // single string-literal.
  568. //
  569. // We opt not to try and format expressions inside {} within a C#
  570. // interpolated string. Formatting expressions within an interpolated string
  571. // would require similar work as that done for JavaScript template strings
  572. // in `handleTemplateStrings()`.
  573. for (int UnmatchedOpeningBraceCount = 0; Begin < End; ++Begin) {
  574. switch (*Begin) {
  575. case '\\':
  576. if (!Verbatim)
  577. ++Begin;
  578. break;
  579. case '{':
  580. if (Interpolated) {
  581. // {{ inside an interpolated string is escaped, so skip it.
  582. if (Repeated())
  583. ++Begin;
  584. else
  585. ++UnmatchedOpeningBraceCount;
  586. }
  587. break;
  588. case '}':
  589. if (Interpolated) {
  590. // }} inside an interpolated string is escaped, so skip it.
  591. if (Repeated())
  592. ++Begin;
  593. else if (UnmatchedOpeningBraceCount > 0)
  594. --UnmatchedOpeningBraceCount;
  595. else
  596. return End;
  597. }
  598. break;
  599. case '"':
  600. if (UnmatchedOpeningBraceCount > 0)
  601. break;
  602. // "" within a verbatim string is an escaped double quote: skip it.
  603. if (Verbatim && Repeated()) {
  604. ++Begin;
  605. break;
  606. }
  607. return Begin;
  608. }
  609. }
  610. return End;
  611. }
  612. void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
  613. FormatToken *CSharpStringLiteral = Tokens.back();
  614. if (CSharpStringLiteral->isNot(TT_CSharpStringLiteral))
  615. return;
  616. auto &TokenText = CSharpStringLiteral->TokenText;
  617. bool Verbatim = false;
  618. bool Interpolated = false;
  619. if (TokenText.startswith(R"($@")") || TokenText.startswith(R"(@$")")) {
  620. Verbatim = true;
  621. Interpolated = true;
  622. } else if (TokenText.startswith(R"(@")")) {
  623. Verbatim = true;
  624. } else if (TokenText.startswith(R"($")")) {
  625. Interpolated = true;
  626. }
  627. // Deal with multiline strings.
  628. if (!Verbatim && !Interpolated)
  629. return;
  630. const char *StrBegin = Lex->getBufferLocation() - TokenText.size();
  631. const char *Offset = StrBegin;
  632. if (Verbatim && Interpolated)
  633. Offset += 3;
  634. else
  635. Offset += 2;
  636. const auto End = Lex->getBuffer().end();
  637. Offset = lexCSharpString(Offset, End, Verbatim, Interpolated);
  638. // Make no attempt to format code properly if a verbatim string is
  639. // unterminated.
  640. if (Offset >= End)
  641. return;
  642. StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
  643. TokenText = LiteralText;
  644. // Adjust width for potentially multiline string literals.
  645. size_t FirstBreak = LiteralText.find('\n');
  646. StringRef FirstLineText = FirstBreak == StringRef::npos
  647. ? LiteralText
  648. : LiteralText.substr(0, FirstBreak);
  649. CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
  650. FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
  651. Encoding);
  652. size_t LastBreak = LiteralText.rfind('\n');
  653. if (LastBreak != StringRef::npos) {
  654. CSharpStringLiteral->IsMultiline = true;
  655. unsigned StartColumn = 0;
  656. CSharpStringLiteral->LastLineColumnWidth =
  657. encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
  658. StartColumn, Style.TabWidth, Encoding);
  659. }
  660. assert(Offset < End);
  661. resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1)));
  662. }
  663. void FormatTokenLexer::handleTemplateStrings() {
  664. FormatToken *BacktickToken = Tokens.back();
  665. if (BacktickToken->is(tok::l_brace)) {
  666. StateStack.push(LexerState::NORMAL);
  667. return;
  668. }
  669. if (BacktickToken->is(tok::r_brace)) {
  670. if (StateStack.size() == 1)
  671. return;
  672. StateStack.pop();
  673. if (StateStack.top() != LexerState::TEMPLATE_STRING)
  674. return;
  675. // If back in TEMPLATE_STRING, fall through and continue parsing the template string.
  676. } else if (BacktickToken->is(tok::unknown) &&
  677. BacktickToken->TokenText == "`") {
  678. StateStack.push(LexerState::TEMPLATE_STRING);
  679. } else {
  680. return; // Not actually a template
  681. }
  682. // 'Manually' lex ahead in the current file buffer.
  683. const char *Offset = Lex->getBufferLocation();
  684. const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
  685. for (; Offset != Lex->getBuffer().end(); ++Offset) {
  686. if (Offset[0] == '`') {
  687. StateStack.pop();
  688. ++Offset;
  689. break;
  690. }
  691. if (Offset[0] == '\\') {
  692. ++Offset; // Skip the escaped character.
  693. } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
  694. Offset[1] == '{') {
  695. // '${' introduces an expression interpolation in the template string.
  696. StateStack.push(LexerState::NORMAL);
  697. Offset += 2;
  698. break;
  699. }
  700. }
  701. StringRef LiteralText(TmplBegin, Offset - TmplBegin);
  702. BacktickToken->setType(TT_TemplateString);
  703. BacktickToken->Tok.setKind(tok::string_literal);
  704. BacktickToken->TokenText = LiteralText;
  705. // Adjust width for potentially multiline string literals.
  706. size_t FirstBreak = LiteralText.find('\n');
  707. StringRef FirstLineText = FirstBreak == StringRef::npos
  708. ? LiteralText
  709. : LiteralText.substr(0, FirstBreak);
  710. BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
  711. FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
  712. size_t LastBreak = LiteralText.rfind('\n');
  713. if (LastBreak != StringRef::npos) {
  714. BacktickToken->IsMultiline = true;
  715. unsigned StartColumn = 0; // The template tail spans the entire line.
  716. BacktickToken->LastLineColumnWidth =
  717. encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
  718. StartColumn, Style.TabWidth, Encoding);
  719. }
  720. SourceLocation loc = Lex->getSourceLocation(Offset);
  721. resetLexer(SourceMgr.getFileOffset(loc));
  722. }
  723. void FormatTokenLexer::tryParsePythonComment() {
  724. FormatToken *HashToken = Tokens.back();
  725. if (!HashToken->isOneOf(tok::hash, tok::hashhash))
  726. return;
  727. // Turn the remainder of this line into a comment.
  728. const char *CommentBegin =
  729. Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
  730. size_t From = CommentBegin - Lex->getBuffer().begin();
  731. size_t To = Lex->getBuffer().find_first_of('\n', From);
  732. if (To == StringRef::npos)
  733. To = Lex->getBuffer().size();
  734. size_t Len = To - From;
  735. HashToken->setType(TT_LineComment);
  736. HashToken->Tok.setKind(tok::comment);
  737. HashToken->TokenText = Lex->getBuffer().substr(From, Len);
  738. SourceLocation Loc = To < Lex->getBuffer().size()
  739. ? Lex->getSourceLocation(CommentBegin + Len)
  740. : SourceMgr.getLocForEndOfFile(ID);
  741. resetLexer(SourceMgr.getFileOffset(Loc));
  742. }
  743. bool FormatTokenLexer::tryMerge_TMacro() {
  744. if (Tokens.size() < 4)
  745. return false;
  746. FormatToken *Last = Tokens.back();
  747. if (!Last->is(tok::r_paren))
  748. return false;
  749. FormatToken *String = Tokens[Tokens.size() - 2];
  750. if (!String->is(tok::string_literal) || String->IsMultiline)
  751. return false;
  752. if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
  753. return false;
  754. FormatToken *Macro = Tokens[Tokens.size() - 4];
  755. if (Macro->TokenText != "_T")
  756. return false;
  757. const char *Start = Macro->TokenText.data();
  758. const char *End = Last->TokenText.data() + Last->TokenText.size();
  759. String->TokenText = StringRef(Start, End - Start);
  760. String->IsFirst = Macro->IsFirst;
  761. String->LastNewlineOffset = Macro->LastNewlineOffset;
  762. String->WhitespaceRange = Macro->WhitespaceRange;
  763. String->OriginalColumn = Macro->OriginalColumn;
  764. String->ColumnWidth = encoding::columnWidthWithTabs(
  765. String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
  766. String->NewlinesBefore = Macro->NewlinesBefore;
  767. String->HasUnescapedNewline = Macro->HasUnescapedNewline;
  768. Tokens.pop_back();
  769. Tokens.pop_back();
  770. Tokens.pop_back();
  771. Tokens.back() = String;
  772. if (FirstInLineIndex >= Tokens.size())
  773. FirstInLineIndex = Tokens.size() - 1;
  774. return true;
  775. }
  776. bool FormatTokenLexer::tryMergeConflictMarkers() {
  777. if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
  778. return false;
  779. // Conflict lines look like:
  780. // <marker> <text from the vcs>
  781. // For example:
  782. // >>>>>>> /file/in/file/system at revision 1234
  783. //
  784. // We merge all tokens in a line that starts with a conflict marker
  785. // into a single token with a special token type that the unwrapped line
  786. // parser will use to correctly rebuild the underlying code.
  787. FileID ID;
  788. // Get the position of the first token in the line.
  789. unsigned FirstInLineOffset;
  790. std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
  791. Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
  792. StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
  793. // Calculate the offset of the start of the current line.
  794. auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
  795. if (LineOffset == StringRef::npos)
  796. LineOffset = 0;
  797. else
  798. ++LineOffset;
  799. auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
  800. StringRef LineStart;
  801. if (FirstSpace == StringRef::npos)
  802. LineStart = Buffer.substr(LineOffset);
  803. else
  804. LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
  805. TokenType Type = TT_Unknown;
  806. if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
  807. Type = TT_ConflictStart;
  808. } else if (LineStart == "|||||||" || LineStart == "=======" ||
  809. LineStart == "====") {
  810. Type = TT_ConflictAlternative;
  811. } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
  812. Type = TT_ConflictEnd;
  813. }
  814. if (Type != TT_Unknown) {
  815. FormatToken *Next = Tokens.back();
  816. Tokens.resize(FirstInLineIndex + 1);
  817. // We do not need to build a complete token here, as we will skip it
  818. // during parsing anyway (as we must not touch whitespace around conflict
  819. // markers).
  820. Tokens.back()->setType(Type);
  821. Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
  822. Tokens.push_back(Next);
  823. return true;
  824. }
  825. return false;
  826. }
  827. FormatToken *FormatTokenLexer::getStashedToken() {
  828. // Create a synthesized second '>' or '<' token.
  829. Token Tok = FormatTok->Tok;
  830. StringRef TokenText = FormatTok->TokenText;
  831. unsigned OriginalColumn = FormatTok->OriginalColumn;
  832. FormatTok = new (Allocator.Allocate()) FormatToken;
  833. FormatTok->Tok = Tok;
  834. SourceLocation TokLocation =
  835. FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
  836. FormatTok->Tok.setLocation(TokLocation);
  837. FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
  838. FormatTok->TokenText = TokenText;
  839. FormatTok->ColumnWidth = 1;
  840. FormatTok->OriginalColumn = OriginalColumn + 1;
  841. return FormatTok;
  842. }
  843. /// Truncate the current token to the new length and make the lexer continue
  844. /// from the end of the truncated token. Used for other languages that have
  845. /// different token boundaries, like JavaScript in which a comment ends at a
  846. /// line break regardless of whether the line break follows a backslash. Also
  847. /// used to set the lexer to the end of whitespace if the lexer regards
  848. /// whitespace and an unrecognized symbol as one token.
  849. void FormatTokenLexer::truncateToken(size_t NewLen) {
  850. assert(NewLen <= FormatTok->TokenText.size());
  851. resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
  852. Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
  853. FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
  854. FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
  855. FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
  856. Encoding);
  857. FormatTok->Tok.setLength(NewLen);
  858. }
  859. /// Count the length of leading whitespace in a token.
  860. static size_t countLeadingWhitespace(StringRef Text) {
  861. // Basically counting the length matched by this regex.
  862. // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+"
  863. // Directly using the regex turned out to be slow. With the regex
  864. // version formatting all files in this directory took about 1.25
  865. // seconds. This version took about 0.5 seconds.
  866. const unsigned char *const Begin = Text.bytes_begin();
  867. const unsigned char *const End = Text.bytes_end();
  868. const unsigned char *Cur = Begin;
  869. while (Cur < End) {
  870. if (isspace(Cur[0])) {
  871. ++Cur;
  872. } else if (Cur[0] == '\\' && (Cur[1] == '\n' || Cur[1] == '\r')) {
  873. // A '\' followed by a newline always escapes the newline, regardless
  874. // of whether there is another '\' before it.
  875. // The source has a null byte at the end. So the end of the entire input
  876. // isn't reached yet. Also the lexer doesn't break apart an escaped
  877. // newline.
  878. assert(End - Cur >= 2);
  879. Cur += 2;
  880. } else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' &&
  881. (Cur[3] == '\n' || Cur[3] == '\r')) {
  882. // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
  883. // characters are quoted individually in this comment because if we write
  884. // them together some compilers warn that we have a trigraph in the code.
  885. assert(End - Cur >= 4);
  886. Cur += 4;
  887. } else {
  888. break;
  889. }
  890. }
  891. return Cur - Begin;
  892. }
  893. FormatToken *FormatTokenLexer::getNextToken() {
  894. if (StateStack.top() == LexerState::TOKEN_STASHED) {
  895. StateStack.pop();
  896. return getStashedToken();
  897. }
  898. FormatTok = new (Allocator.Allocate()) FormatToken;
  899. readRawToken(*FormatTok);
  900. SourceLocation WhitespaceStart =
  901. FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
  902. FormatTok->IsFirst = IsFirstToken;
  903. IsFirstToken = false;
  904. // Consume and record whitespace until we find a significant token.
  905. // Some tok::unknown tokens are not just whitespace, e.g. whitespace
  906. // followed by a symbol such as backtick. Those symbols may be
  907. // significant in other languages.
  908. unsigned WhitespaceLength = TrailingWhitespace;
  909. while (FormatTok->isNot(tok::eof)) {
  910. auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
  911. if (LeadingWhitespace == 0)
  912. break;
  913. if (LeadingWhitespace < FormatTok->TokenText.size())
  914. truncateToken(LeadingWhitespace);
  915. StringRef Text = FormatTok->TokenText;
  916. bool InEscape = false;
  917. for (int i = 0, e = Text.size(); i != e; ++i) {
  918. switch (Text[i]) {
  919. case '\r':
  920. // If this is a CRLF sequence, break here and the LF will be handled on
  921. // the next loop iteration. Otherwise, this is a single Mac CR, treat it
  922. // the same as a single LF.
  923. if (i + 1 < e && Text[i + 1] == '\n')
  924. break;
  925. [[fallthrough]];
  926. case '\n':
  927. ++FormatTok->NewlinesBefore;
  928. if (!InEscape)
  929. FormatTok->HasUnescapedNewline = true;
  930. else
  931. InEscape = false;
  932. FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
  933. Column = 0;
  934. break;
  935. case '\f':
  936. case '\v':
  937. Column = 0;
  938. break;
  939. case ' ':
  940. ++Column;
  941. break;
  942. case '\t':
  943. Column +=
  944. Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
  945. break;
  946. case '\\':
  947. case '?':
  948. case '/':
  949. // The text was entirely whitespace when this loop was entered. Thus
  950. // this has to be an escape sequence.
  951. assert(Text.substr(i, 2) == "\\\r" || Text.substr(i, 2) == "\\\n" ||
  952. Text.substr(i, 4) == "\?\?/\r" ||
  953. Text.substr(i, 4) == "\?\?/\n" ||
  954. (i >= 1 && (Text.substr(i - 1, 4) == "\?\?/\r" ||
  955. Text.substr(i - 1, 4) == "\?\?/\n")) ||
  956. (i >= 2 && (Text.substr(i - 2, 4) == "\?\?/\r" ||
  957. Text.substr(i - 2, 4) == "\?\?/\n")));
  958. InEscape = true;
  959. break;
  960. default:
  961. // This shouldn't happen.
  962. assert(false);
  963. break;
  964. }
  965. }
  966. WhitespaceLength += Text.size();
  967. readRawToken(*FormatTok);
  968. }
  969. if (FormatTok->is(tok::unknown))
  970. FormatTok->setType(TT_ImplicitStringLiteral);
  971. // JavaScript and Java do not allow escaping the end of the line with a
  972. // backslash. Backslashes are syntax errors in plain source, but can occur in
  973. // comments. When a single line comment ends with a \, it'll cause the next
  974. // line of code to be lexed as a comment, breaking formatting. The code below
  975. // finds comments that contain a backslash followed by a line break, truncates
  976. // the comment token at the backslash, and resets the lexer to restart behind
  977. // the backslash.
  978. if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Java) &&
  979. FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) {
  980. size_t BackslashPos = FormatTok->TokenText.find('\\');
  981. while (BackslashPos != StringRef::npos) {
  982. if (BackslashPos + 1 < FormatTok->TokenText.size() &&
  983. FormatTok->TokenText[BackslashPos + 1] == '\n') {
  984. truncateToken(BackslashPos + 1);
  985. break;
  986. }
  987. BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
  988. }
  989. }
  990. if (Style.isVerilog()) {
  991. static const llvm::Regex NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase);
  992. SmallVector<StringRef, 1> Matches;
  993. // Verilog uses the backtick instead of the hash for preprocessor stuff.
  994. // And it uses the hash for delays and parameter lists. In order to continue
  995. // using `tok::hash` in other places, the backtick gets marked as the hash
  996. // here. And in order to tell the backtick and hash apart for
  997. // Verilog-specific stuff, the hash becomes an identifier.
  998. if (FormatTok->is(tok::numeric_constant)) {
  999. // In Verilog the quote is not part of a number.
  1000. auto Quote = FormatTok->TokenText.find('\'');
  1001. if (Quote != StringRef::npos)
  1002. truncateToken(Quote);
  1003. } else if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
  1004. FormatTok->Tok.setKind(tok::raw_identifier);
  1005. } else if (FormatTok->is(tok::raw_identifier)) {
  1006. if (FormatTok->TokenText == "`") {
  1007. FormatTok->Tok.setIdentifierInfo(nullptr);
  1008. FormatTok->Tok.setKind(tok::hash);
  1009. } else if (FormatTok->TokenText == "``") {
  1010. FormatTok->Tok.setIdentifierInfo(nullptr);
  1011. FormatTok->Tok.setKind(tok::hashhash);
  1012. } else if (Tokens.size() > 0 &&
  1013. Tokens.back()->is(Keywords.kw_apostrophe) &&
  1014. NumberBase.match(FormatTok->TokenText, &Matches)) {
  1015. // In Verilog in a based number literal like `'b10`, there may be
  1016. // whitespace between `'b` and `10`. Therefore we handle the base and
  1017. // the rest of the number literal as two tokens. But if there is no
  1018. // space in the input code, we need to manually separate the two parts.
  1019. truncateToken(Matches[0].size());
  1020. FormatTok->setFinalizedType(TT_VerilogNumberBase);
  1021. }
  1022. }
  1023. }
  1024. FormatTok->WhitespaceRange = SourceRange(
  1025. WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
  1026. FormatTok->OriginalColumn = Column;
  1027. TrailingWhitespace = 0;
  1028. if (FormatTok->is(tok::comment)) {
  1029. // FIXME: Add the trimmed whitespace to Column.
  1030. StringRef UntrimmedText = FormatTok->TokenText;
  1031. FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
  1032. TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
  1033. } else if (FormatTok->is(tok::raw_identifier)) {
  1034. IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
  1035. FormatTok->Tok.setIdentifierInfo(&Info);
  1036. FormatTok->Tok.setKind(Info.getTokenID());
  1037. if (Style.Language == FormatStyle::LK_Java &&
  1038. FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
  1039. tok::kw_operator)) {
  1040. FormatTok->Tok.setKind(tok::identifier);
  1041. FormatTok->Tok.setIdentifierInfo(nullptr);
  1042. } else if (Style.isJavaScript() &&
  1043. FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
  1044. tok::kw_operator)) {
  1045. FormatTok->Tok.setKind(tok::identifier);
  1046. FormatTok->Tok.setIdentifierInfo(nullptr);
  1047. }
  1048. } else if (FormatTok->is(tok::greatergreater)) {
  1049. FormatTok->Tok.setKind(tok::greater);
  1050. FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
  1051. ++Column;
  1052. StateStack.push(LexerState::TOKEN_STASHED);
  1053. } else if (FormatTok->is(tok::lessless)) {
  1054. FormatTok->Tok.setKind(tok::less);
  1055. FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
  1056. ++Column;
  1057. StateStack.push(LexerState::TOKEN_STASHED);
  1058. }
  1059. if (Style.isVerilog() && Tokens.size() > 0 &&
  1060. Tokens.back()->is(TT_VerilogNumberBase) &&
  1061. FormatTok->Tok.isOneOf(tok::identifier, tok::question)) {
  1062. // Mark the number following a base like `'h?a0` as a number.
  1063. FormatTok->Tok.setKind(tok::numeric_constant);
  1064. }
  1065. // Now FormatTok is the next non-whitespace token.
  1066. StringRef Text = FormatTok->TokenText;
  1067. size_t FirstNewlinePos = Text.find('\n');
  1068. if (FirstNewlinePos == StringRef::npos) {
  1069. // FIXME: ColumnWidth actually depends on the start column, we need to
  1070. // take this into account when the token is moved.
  1071. FormatTok->ColumnWidth =
  1072. encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
  1073. Column += FormatTok->ColumnWidth;
  1074. } else {
  1075. FormatTok->IsMultiline = true;
  1076. // FIXME: ColumnWidth actually depends on the start column, we need to
  1077. // take this into account when the token is moved.
  1078. FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
  1079. Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
  1080. // The last line of the token always starts in column 0.
  1081. // Thus, the length can be precomputed even in the presence of tabs.
  1082. FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
  1083. Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
  1084. Column = FormatTok->LastLineColumnWidth;
  1085. }
  1086. if (Style.isCpp()) {
  1087. auto it = Macros.find(FormatTok->Tok.getIdentifierInfo());
  1088. if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
  1089. Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
  1090. tok::pp_define) &&
  1091. it != Macros.end()) {
  1092. FormatTok->setType(it->second);
  1093. if (it->second == TT_IfMacro) {
  1094. // The lexer token currently has type tok::kw_unknown. However, for this
  1095. // substitution to be treated correctly in the TokenAnnotator, faking
  1096. // the tok value seems to be needed. Not sure if there's a more elegant
  1097. // way.
  1098. FormatTok->Tok.setKind(tok::kw_if);
  1099. }
  1100. } else if (FormatTok->is(tok::identifier)) {
  1101. if (MacroBlockBeginRegex.match(Text))
  1102. FormatTok->setType(TT_MacroBlockBegin);
  1103. else if (MacroBlockEndRegex.match(Text))
  1104. FormatTok->setType(TT_MacroBlockEnd);
  1105. }
  1106. }
  1107. return FormatTok;
  1108. }
  1109. bool FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) {
  1110. // In Verilog the quote is not a character literal.
  1111. //
  1112. // Make the backtick and double backtick identifiers so that it is easier
  1113. // to match against them.
  1114. //
  1115. // In Verilog an escaped identifier starts with backslash and ends with
  1116. // whitespace. Unless that whitespace is an escaped newline. A backslash can
  1117. // also begin an escaped newline outside of an escaped identifier. We check
  1118. // for that outside of the Regex since we can't use negative lookahead
  1119. // assertions. Simply changing the '*' to '+' breaks stuff as the escaped
  1120. // identifier may have a length of 0 according to Section A.9.3.
  1121. // FIXME: If there is an escaped newline in the middle of an escaped
  1122. // identifier, allow for pasting the two lines together. But escaped
  1123. // identifiers usually occur only in generated code anyway.
  1124. static const llvm::Regex VerilogToken(R"re(^('|``?|\\(\\)re"
  1125. "(\r?\n|\r)|[^[:space:]])*)");
  1126. SmallVector<StringRef, 4> Matches;
  1127. const char *Start = Lex->getBufferLocation();
  1128. if (!VerilogToken.match(StringRef(Start, Lex->getBuffer().end() - Start),
  1129. &Matches)) {
  1130. return false;
  1131. }
  1132. // There is a null byte at the end of the buffer, so we don't have to check
  1133. // that Start[1] is within the buffer.
  1134. if (Start[0] == '\\' && (Start[1] == '\r' || Start[1] == '\n'))
  1135. return false;
  1136. size_t Len = Matches[0].size();
  1137. // The kind has to be an identifier so we can match it against those defined
  1138. // in Keywords. The kind has to be set before the length because the setLength
  1139. // function checks that the kind is not an annotation.
  1140. Tok.setKind(tok::raw_identifier);
  1141. Tok.setLength(Len);
  1142. Tok.setLocation(Lex->getSourceLocation(Start, Len));
  1143. Tok.setRawIdentifierData(Start);
  1144. Lex->seek(Lex->getCurrentBufferOffset() + Len, /*IsAtStartOfLine=*/false);
  1145. return true;
  1146. }
  1147. void FormatTokenLexer::readRawToken(FormatToken &Tok) {
  1148. // For Verilog, first see if there is a special token, and fall back to the
  1149. // normal lexer if there isn't one.
  1150. if (!Style.isVerilog() || !readRawTokenVerilogSpecific(Tok.Tok))
  1151. Lex->LexFromRawLexer(Tok.Tok);
  1152. Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
  1153. Tok.Tok.getLength());
  1154. // For formatting, treat unterminated string literals like normal string
  1155. // literals.
  1156. if (Tok.is(tok::unknown)) {
  1157. if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
  1158. Tok.Tok.setKind(tok::string_literal);
  1159. Tok.IsUnterminatedLiteral = true;
  1160. } else if (Style.isJavaScript() && Tok.TokenText == "''") {
  1161. Tok.Tok.setKind(tok::string_literal);
  1162. }
  1163. }
  1164. if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Proto ||
  1165. Style.Language == FormatStyle::LK_TextProto) &&
  1166. Tok.is(tok::char_constant)) {
  1167. Tok.Tok.setKind(tok::string_literal);
  1168. }
  1169. if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
  1170. Tok.TokenText == "/* clang-format on */")) {
  1171. FormattingDisabled = false;
  1172. }
  1173. Tok.Finalized = FormattingDisabled;
  1174. if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
  1175. Tok.TokenText == "/* clang-format off */")) {
  1176. FormattingDisabled = true;
  1177. }
  1178. }
  1179. void FormatTokenLexer::resetLexer(unsigned Offset) {
  1180. StringRef Buffer = SourceMgr.getBufferData(ID);
  1181. LangOpts = getFormattingLangOpts(Style);
  1182. Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
  1183. Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
  1184. Lex->SetKeepWhitespaceMode(true);
  1185. TrailingWhitespace = 0;
  1186. }
  1187. } // namespace format
  1188. } // namespace clang