LiteralSupport.cpp 78 KB


  1. //===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file implements the NumericLiteralParser, CharLiteralParser, and
  10. // StringLiteralParser interfaces.
  11. //
  12. //===----------------------------------------------------------------------===//
  13. #include "clang/Lex/LiteralSupport.h"
  14. #include "clang/Basic/CharInfo.h"
  15. #include "clang/Basic/LangOptions.h"
  16. #include "clang/Basic/SourceLocation.h"
  17. #include "clang/Basic/TargetInfo.h"
  18. #include "clang/Lex/LexDiagnostic.h"
  19. #include "clang/Lex/Lexer.h"
  20. #include "clang/Lex/Preprocessor.h"
  21. #include "clang/Lex/Token.h"
  22. #include "llvm/ADT/APInt.h"
  23. #include "llvm/ADT/SmallVector.h"
  24. #include "llvm/ADT/StringExtras.h"
  25. #include "llvm/ADT/StringSwitch.h"
  26. #include "llvm/Support/ConvertUTF.h"
  27. #include "llvm/Support/Error.h"
  28. #include "llvm/Support/ErrorHandling.h"
  29. #include "llvm/Support/Unicode.h"
  30. #include <algorithm>
  31. #include <cassert>
  32. #include <cstddef>
  33. #include <cstdint>
  34. #include <cstring>
  35. #include <string>
  36. using namespace clang;
  37. static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
  38. switch (kind) {
  39. default: llvm_unreachable("Unknown token type!");
  40. case tok::char_constant:
  41. case tok::string_literal:
  42. case tok::utf8_char_constant:
  43. case tok::utf8_string_literal:
  44. return Target.getCharWidth();
  45. case tok::wide_char_constant:
  46. case tok::wide_string_literal:
  47. return Target.getWCharWidth();
  48. case tok::utf16_char_constant:
  49. case tok::utf16_string_literal:
  50. return Target.getChar16Width();
  51. case tok::utf32_char_constant:
  52. case tok::utf32_string_literal:
  53. return Target.getChar32Width();
  54. }
  55. }
  56. static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
  57. FullSourceLoc TokLoc,
  58. const char *TokBegin,
  59. const char *TokRangeBegin,
  60. const char *TokRangeEnd) {
  61. SourceLocation Begin =
  62. Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
  63. TokLoc.getManager(), Features);
  64. SourceLocation End =
  65. Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
  66. TokLoc.getManager(), Features);
  67. return CharSourceRange::getCharRange(Begin, End);
  68. }
  69. /// Produce a diagnostic highlighting some portion of a literal.
  70. ///
  71. /// Emits the diagnostic \p DiagID, highlighting the range of characters from
  72. /// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
  73. /// a substring of a spelling buffer for the token beginning at \p TokBegin.
  74. static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
  75. const LangOptions &Features, FullSourceLoc TokLoc,
  76. const char *TokBegin, const char *TokRangeBegin,
  77. const char *TokRangeEnd, unsigned DiagID) {
  78. SourceLocation Begin =
  79. Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
  80. TokLoc.getManager(), Features);
  81. return Diags->Report(Begin, DiagID) <<
  82. MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
  83. }
  84. /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
  85. /// either a character or a string literal.
  86. static unsigned ProcessCharEscape(const char *ThisTokBegin,
  87. const char *&ThisTokBuf,
  88. const char *ThisTokEnd, bool &HadError,
  89. FullSourceLoc Loc, unsigned CharWidth,
  90. DiagnosticsEngine *Diags,
  91. const LangOptions &Features) {
  92. const char *EscapeBegin = ThisTokBuf;
  93. bool Delimited = false;
  94. bool EndDelimiterFound = false;
  95. // Skip the '\' char.
  96. ++ThisTokBuf;
  97. // We know that this character can't be off the end of the buffer, because
  98. // that would have been \", which would not have been the end of string.
  99. unsigned ResultChar = *ThisTokBuf++;
  100. switch (ResultChar) {
  101. // These map to themselves.
  102. case '\\': case '\'': case '"': case '?': break;
  103. // These have fixed mappings.
  104. case 'a':
  105. // TODO: K&R: the meaning of '\\a' is different in traditional C
  106. ResultChar = 7;
  107. break;
  108. case 'b':
  109. ResultChar = 8;
  110. break;
  111. case 'e':
  112. if (Diags)
  113. Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
  114. diag::ext_nonstandard_escape) << "e";
  115. ResultChar = 27;
  116. break;
  117. case 'E':
  118. if (Diags)
  119. Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
  120. diag::ext_nonstandard_escape) << "E";
  121. ResultChar = 27;
  122. break;
  123. case 'f':
  124. ResultChar = 12;
  125. break;
  126. case 'n':
  127. ResultChar = 10;
  128. break;
  129. case 'r':
  130. ResultChar = 13;
  131. break;
  132. case 't':
  133. ResultChar = 9;
  134. break;
  135. case 'v':
  136. ResultChar = 11;
  137. break;
  138. case 'x': { // Hex escape.
  139. ResultChar = 0;
  140. if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
  141. Delimited = true;
  142. ThisTokBuf++;
  143. if (*ThisTokBuf == '}') {
  144. Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
  145. diag::err_delimited_escape_empty);
  146. return ResultChar;
  147. }
  148. } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
  149. if (Diags)
  150. Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
  151. diag::err_hex_escape_no_digits) << "x";
  152. return ResultChar;
  153. }
  154. // Hex escapes are a maximal series of hex digits.
  155. bool Overflow = false;
  156. for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
  157. if (Delimited && *ThisTokBuf == '}') {
  158. ThisTokBuf++;
  159. EndDelimiterFound = true;
  160. break;
  161. }
  162. int CharVal = llvm::hexDigitValue(*ThisTokBuf);
  163. if (CharVal == -1) {
  164. // Non delimited hex escape sequences stop at the first non-hex digit.
  165. if (!Delimited)
  166. break;
  167. HadError = true;
  168. if (Diags)
  169. Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
  170. diag::err_delimited_escape_invalid)
  171. << StringRef(ThisTokBuf, 1);
  172. continue;
  173. }
  174. // About to shift out a digit?
  175. if (ResultChar & 0xF0000000)
  176. Overflow = true;
  177. ResultChar <<= 4;
  178. ResultChar |= CharVal;
  179. }
  180. // See if any bits will be truncated when evaluated as a character.
  181. if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
  182. Overflow = true;
  183. ResultChar &= ~0U >> (32-CharWidth);
  184. }
  185. // Check for overflow.
  186. if (!HadError && Overflow) { // Too many digits to fit in
  187. HadError = true;
  188. if (Diags)
  189. Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
  190. diag::err_escape_too_large)
  191. << 0;
  192. }
  193. break;
  194. }
  195. case '0': case '1': case '2': case '3':
  196. case '4': case '5': case '6': case '7': {
  197. // Octal escapes.
  198. --ThisTokBuf;
  199. ResultChar = 0;
  200. // Octal escapes are a series of octal digits with maximum length 3.
  201. // "\0123" is a two digit sequence equal to "\012" "3".
  202. unsigned NumDigits = 0;
  203. do {
  204. ResultChar <<= 3;
  205. ResultChar |= *ThisTokBuf++ - '0';
  206. ++NumDigits;
  207. } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
  208. ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
  209. // Check for overflow. Reject '\777', but not L'\777'.
  210. if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
  211. if (Diags)
  212. Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
  213. diag::err_escape_too_large) << 1;
  214. ResultChar &= ~0U >> (32-CharWidth);
  215. }
  216. break;
  217. }
  218. case 'o': {
  219. bool Overflow = false;
  220. if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
  221. HadError = true;
  222. if (Diags)
  223. Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
  224. diag::err_delimited_escape_missing_brace)
  225. << "o";
  226. break;
  227. }
  228. ResultChar = 0;
  229. Delimited = true;
  230. ++ThisTokBuf;
  231. if (*ThisTokBuf == '}') {
  232. Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
  233. diag::err_delimited_escape_empty);
  234. return ResultChar;
  235. }
  236. while (ThisTokBuf != ThisTokEnd) {
  237. if (*ThisTokBuf == '}') {
  238. EndDelimiterFound = true;
  239. ThisTokBuf++;
  240. break;
  241. }
  242. if (*ThisTokBuf < '0' || *ThisTokBuf > '7') {
  243. HadError = true;
  244. if (Diags)
  245. Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
  246. diag::err_delimited_escape_invalid)
  247. << StringRef(ThisTokBuf, 1);
  248. ThisTokBuf++;
  249. continue;
  250. }
  251. if (ResultChar & 0x020000000)
  252. Overflow = true;
  253. ResultChar <<= 3;
  254. ResultChar |= *ThisTokBuf++ - '0';
  255. }
  256. // Check for overflow. Reject '\777', but not L'\777'.
  257. if (!HadError &&
  258. (Overflow || (CharWidth != 32 && (ResultChar >> CharWidth) != 0))) {
  259. HadError = true;
  260. if (Diags)
  261. Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
  262. diag::err_escape_too_large)
  263. << 1;
  264. ResultChar &= ~0U >> (32 - CharWidth);
  265. }
  266. break;
  267. }
  268. // Otherwise, these are not valid escapes.
  269. case '(': case '{': case '[': case '%':
  270. // GCC accepts these as extensions. We warn about them as such though.
  271. if (Diags)
  272. Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
  273. diag::ext_nonstandard_escape)
  274. << std::string(1, ResultChar);
  275. break;
  276. default:
  277. if (!Diags)
  278. break;
  279. if (isPrintable(ResultChar))
  280. Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
  281. diag::ext_unknown_escape)
  282. << std::string(1, ResultChar);
  283. else
  284. Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
  285. diag::ext_unknown_escape)
  286. << "x" + llvm::utohexstr(ResultChar);
  287. break;
  288. }
  289. if (Delimited && Diags) {
  290. if (!EndDelimiterFound)
  291. Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
  292. diag::err_expected)
  293. << tok::r_brace;
  294. else if (!HadError) {
  295. Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
  296. Features.CPlusPlus2b ? diag::warn_cxx2b_delimited_escape_sequence
  297. : diag::ext_delimited_escape_sequence)
  298. << /*delimited*/ 0 << (Features.CPlusPlus ? 1 : 0);
  299. }
  300. }
  301. return ResultChar;
  302. }
  303. static void appendCodePoint(unsigned Codepoint,
  304. llvm::SmallVectorImpl<char> &Str) {
  305. char ResultBuf[4];
  306. char *ResultPtr = ResultBuf;
  307. if (llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr))
  308. Str.append(ResultBuf, ResultPtr);
  309. }
  310. void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
  311. for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
  312. if (*I != '\\') {
  313. Buf.push_back(*I);
  314. continue;
  315. }
  316. ++I;
  317. char Kind = *I;
  318. ++I;
  319. assert(Kind == 'u' || Kind == 'U' || Kind == 'N');
  320. uint32_t CodePoint = 0;
  321. if (Kind == 'u' && *I == '{') {
  322. for (++I; *I != '}'; ++I) {
  323. unsigned Value = llvm::hexDigitValue(*I);
  324. assert(Value != -1U);
  325. CodePoint <<= 4;
  326. CodePoint += Value;
  327. }
  328. appendCodePoint(CodePoint, Buf);
  329. continue;
  330. }
  331. if (Kind == 'N') {
  332. assert(*I == '{');
  333. ++I;
  334. auto Delim = std::find(I, Input.end(), '}');
  335. assert(Delim != Input.end());
  336. std::optional<llvm::sys::unicode::LooseMatchingResult> Res =
  337. llvm::sys::unicode::nameToCodepointLooseMatching(
  338. StringRef(I, std::distance(I, Delim)));
  339. assert(Res);
  340. CodePoint = Res->CodePoint;
  341. assert(CodePoint != 0xFFFFFFFF);
  342. appendCodePoint(CodePoint, Buf);
  343. I = Delim;
  344. continue;
  345. }
  346. unsigned NumHexDigits;
  347. if (Kind == 'u')
  348. NumHexDigits = 4;
  349. else
  350. NumHexDigits = 8;
  351. assert(I + NumHexDigits <= E);
  352. for (; NumHexDigits != 0; ++I, --NumHexDigits) {
  353. unsigned Value = llvm::hexDigitValue(*I);
  354. assert(Value != -1U);
  355. CodePoint <<= 4;
  356. CodePoint += Value;
  357. }
  358. appendCodePoint(CodePoint, Buf);
  359. --I;
  360. }
  361. }
  362. static bool ProcessNumericUCNEscape(const char *ThisTokBegin,
  363. const char *&ThisTokBuf,
  364. const char *ThisTokEnd, uint32_t &UcnVal,
  365. unsigned short &UcnLen, bool &Delimited,
  366. FullSourceLoc Loc, DiagnosticsEngine *Diags,
  367. const LangOptions &Features,
  368. bool in_char_string_literal = false) {
  369. const char *UcnBegin = ThisTokBuf;
  370. bool HasError = false;
  371. bool EndDelimiterFound = false;
  372. // Skip the '\u' char's.
  373. ThisTokBuf += 2;
  374. Delimited = false;
  375. if (UcnBegin[1] == 'u' && in_char_string_literal &&
  376. ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
  377. Delimited = true;
  378. ThisTokBuf++;
  379. } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
  380. if (Diags)
  381. Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
  382. diag::err_hex_escape_no_digits)
  383. << StringRef(&ThisTokBuf[-1], 1);
  384. return false;
  385. }
  386. UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
  387. bool Overflow = false;
  388. unsigned short Count = 0;
  389. for (; ThisTokBuf != ThisTokEnd && (Delimited || Count != UcnLen);
  390. ++ThisTokBuf) {
  391. if (Delimited && *ThisTokBuf == '}') {
  392. ++ThisTokBuf;
  393. EndDelimiterFound = true;
  394. break;
  395. }
  396. int CharVal = llvm::hexDigitValue(*ThisTokBuf);
  397. if (CharVal == -1) {
  398. HasError = true;
  399. if (!Delimited)
  400. break;
  401. if (Diags) {
  402. Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
  403. diag::err_delimited_escape_invalid)
  404. << StringRef(ThisTokBuf, 1);
  405. }
  406. Count++;
  407. continue;
  408. }
  409. if (UcnVal & 0xF0000000) {
  410. Overflow = true;
  411. continue;
  412. }
  413. UcnVal <<= 4;
  414. UcnVal |= CharVal;
  415. Count++;
  416. }
  417. if (Overflow) {
  418. if (Diags)
  419. Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
  420. diag::err_escape_too_large)
  421. << 0;
  422. return false;
  423. }
  424. if (Delimited && !EndDelimiterFound) {
  425. if (Diags) {
  426. Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
  427. diag::err_expected)
  428. << tok::r_brace;
  429. }
  430. return false;
  431. }
  432. // If we didn't consume the proper number of digits, there is a problem.
  433. if (Count == 0 || (!Delimited && Count != UcnLen)) {
  434. if (Diags)
  435. Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
  436. Delimited ? diag::err_delimited_escape_empty
  437. : diag::err_ucn_escape_incomplete);
  438. return false;
  439. }
  440. return !HasError;
  441. }
  442. static void DiagnoseInvalidUnicodeCharacterName(
  443. DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc Loc,
  444. const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd,
  445. llvm::StringRef Name) {
  446. Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
  447. diag::err_invalid_ucn_name)
  448. << Name;
  449. namespace u = llvm::sys::unicode;
  450. std::optional<u::LooseMatchingResult> Res =
  451. u::nameToCodepointLooseMatching(Name);
  452. if (Res) {
  453. Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
  454. diag::note_invalid_ucn_name_loose_matching)
  455. << FixItHint::CreateReplacement(
  456. MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
  457. TokRangeEnd),
  458. Res->Name);
  459. return;
  460. }
  461. unsigned Distance = 0;
  462. SmallVector<u::MatchForCodepointName> Matches =
  463. u::nearestMatchesForCodepointName(Name, 5);
  464. assert(!Matches.empty() && "No unicode characters found");
  465. for (const auto &Match : Matches) {
  466. if (Distance == 0)
  467. Distance = Match.Distance;
  468. if (std::max(Distance, Match.Distance) -
  469. std::min(Distance, Match.Distance) >
  470. 3)
  471. break;
  472. Distance = Match.Distance;
  473. std::string Str;
  474. llvm::UTF32 V = Match.Value;
  475. bool Converted =
  476. llvm::convertUTF32ToUTF8String(llvm::ArrayRef<llvm::UTF32>(&V, 1), Str);
  477. (void)Converted;
  478. assert(Converted && "Found a match wich is not a unicode character");
  479. Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
  480. diag::note_invalid_ucn_name_candidate)
  481. << Match.Name << llvm::utohexstr(Match.Value)
  482. << Str // FIXME: Fix the rendering of non printable characters
  483. << FixItHint::CreateReplacement(
  484. MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
  485. TokRangeEnd),
  486. Match.Name);
  487. }
  488. }
  489. static bool ProcessNamedUCNEscape(const char *ThisTokBegin,
  490. const char *&ThisTokBuf,
  491. const char *ThisTokEnd, uint32_t &UcnVal,
  492. unsigned short &UcnLen, FullSourceLoc Loc,
  493. DiagnosticsEngine *Diags,
  494. const LangOptions &Features) {
  495. const char *UcnBegin = ThisTokBuf;
  496. assert(UcnBegin[0] == '\\' && UcnBegin[1] == 'N');
  497. ThisTokBuf += 2;
  498. if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
  499. if (Diags) {
  500. Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
  501. diag::err_delimited_escape_missing_brace)
  502. << StringRef(&ThisTokBuf[-1], 1);
  503. }
  504. return false;
  505. }
  506. ThisTokBuf++;
  507. const char *ClosingBrace = std::find_if(ThisTokBuf, ThisTokEnd, [](char C) {
  508. return C == '}' || isVerticalWhitespace(C);
  509. });
  510. bool Incomplete = ClosingBrace == ThisTokEnd;
  511. bool Empty = ClosingBrace == ThisTokBuf;
  512. if (Incomplete || Empty) {
  513. if (Diags) {
  514. Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
  515. Incomplete ? diag::err_ucn_escape_incomplete
  516. : diag::err_delimited_escape_empty)
  517. << StringRef(&UcnBegin[1], 1);
  518. }
  519. ThisTokBuf = ClosingBrace == ThisTokEnd ? ClosingBrace : ClosingBrace + 1;
  520. return false;
  521. }
  522. StringRef Name(ThisTokBuf, ClosingBrace - ThisTokBuf);
  523. ThisTokBuf = ClosingBrace + 1;
  524. std::optional<char32_t> Res = llvm::sys::unicode::nameToCodepointStrict(Name);
  525. if (!Res) {
  526. if (Diags)
  527. DiagnoseInvalidUnicodeCharacterName(Diags, Features, Loc, ThisTokBegin,
  528. &UcnBegin[3], ClosingBrace, Name);
  529. return false;
  530. }
  531. UcnVal = *Res;
  532. UcnLen = UcnVal > 0xFFFF ? 8 : 4;
  533. return true;
  534. }
  535. /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
  536. /// return the UTF32.
  537. static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
  538. const char *ThisTokEnd, uint32_t &UcnVal,
  539. unsigned short &UcnLen, FullSourceLoc Loc,
  540. DiagnosticsEngine *Diags,
  541. const LangOptions &Features,
  542. bool in_char_string_literal = false) {
  543. bool HasError;
  544. const char *UcnBegin = ThisTokBuf;
  545. bool IsDelimitedEscapeSequence = false;
  546. bool IsNamedEscapeSequence = false;
  547. if (ThisTokBuf[1] == 'N') {
  548. IsNamedEscapeSequence = true;
  549. HasError = !ProcessNamedUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
  550. UcnVal, UcnLen, Loc, Diags, Features);
  551. } else {
  552. HasError =
  553. !ProcessNumericUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
  554. UcnLen, IsDelimitedEscapeSequence, Loc, Diags,
  555. Features, in_char_string_literal);
  556. }
  557. if (HasError)
  558. return false;
  559. // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
  560. if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
  561. UcnVal > 0x10FFFF) { // maximum legal UTF32 value
  562. if (Diags)
  563. Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
  564. diag::err_ucn_escape_invalid);
  565. return false;
  566. }
  567. // C++11 allows UCNs that refer to control characters and basic source
  568. // characters inside character and string literals
  569. if (UcnVal < 0xa0 &&
  570. (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) { // $, @, `
  571. bool IsError = (!Features.CPlusPlus11 || !in_char_string_literal);
  572. if (Diags) {
  573. char BasicSCSChar = UcnVal;
  574. if (UcnVal >= 0x20 && UcnVal < 0x7f)
  575. Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
  576. IsError ? diag::err_ucn_escape_basic_scs :
  577. diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
  578. << StringRef(&BasicSCSChar, 1);
  579. else
  580. Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
  581. IsError ? diag::err_ucn_control_character :
  582. diag::warn_cxx98_compat_literal_ucn_control_character);
  583. }
  584. if (IsError)
  585. return false;
  586. }
  587. if (!Features.CPlusPlus && !Features.C99 && Diags)
  588. Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
  589. diag::warn_ucn_not_valid_in_c89_literal);
  590. if ((IsDelimitedEscapeSequence || IsNamedEscapeSequence) && Diags)
  591. Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
  592. Features.CPlusPlus2b ? diag::warn_cxx2b_delimited_escape_sequence
  593. : diag::ext_delimited_escape_sequence)
  594. << (IsNamedEscapeSequence ? 1 : 0) << (Features.CPlusPlus ? 1 : 0);
  595. return true;
  596. }
  597. /// MeasureUCNEscape - Determine the number of bytes within the resulting string
  598. /// which this UCN will occupy.
  599. static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
  600. const char *ThisTokEnd, unsigned CharByteWidth,
  601. const LangOptions &Features, bool &HadError) {
  602. // UTF-32: 4 bytes per escape.
  603. if (CharByteWidth == 4)
  604. return 4;
  605. uint32_t UcnVal = 0;
  606. unsigned short UcnLen = 0;
  607. FullSourceLoc Loc;
  608. if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
  609. UcnLen, Loc, nullptr, Features, true)) {
  610. HadError = true;
  611. return 0;
  612. }
  613. // UTF-16: 2 bytes for BMP, 4 bytes otherwise.
  614. if (CharByteWidth == 2)
  615. return UcnVal <= 0xFFFF ? 2 : 4;
  616. // UTF-8.
  617. if (UcnVal < 0x80)
  618. return 1;
  619. if (UcnVal < 0x800)
  620. return 2;
  621. if (UcnVal < 0x10000)
  622. return 3;
  623. return 4;
  624. }
  625. /// EncodeUCNEscape - Read the Universal Character Name, check constraints and
  626. /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
  627. /// StringLiteralParser. When we decide to implement UCN's for identifiers,
  628. /// we will likely rework our support for UCN's.
  629. static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
  630. const char *ThisTokEnd,
  631. char *&ResultBuf, bool &HadError,
  632. FullSourceLoc Loc, unsigned CharByteWidth,
  633. DiagnosticsEngine *Diags,
  634. const LangOptions &Features) {
  635. typedef uint32_t UTF32;
  636. UTF32 UcnVal = 0;
  637. unsigned short UcnLen = 0;
  638. if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
  639. Loc, Diags, Features, true)) {
  640. HadError = true;
  641. return;
  642. }
  643. assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) &&
  644. "only character widths of 1, 2, or 4 bytes supported");
  645. (void)UcnLen;
  646. assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
  647. if (CharByteWidth == 4) {
  648. // FIXME: Make the type of the result buffer correct instead of
  649. // using reinterpret_cast.
  650. llvm::UTF32 *ResultPtr = reinterpret_cast<llvm::UTF32*>(ResultBuf);
  651. *ResultPtr = UcnVal;
  652. ResultBuf += 4;
  653. return;
  654. }
  655. if (CharByteWidth == 2) {
  656. // FIXME: Make the type of the result buffer correct instead of
  657. // using reinterpret_cast.
  658. llvm::UTF16 *ResultPtr = reinterpret_cast<llvm::UTF16*>(ResultBuf);
  659. if (UcnVal <= (UTF32)0xFFFF) {
  660. *ResultPtr = UcnVal;
  661. ResultBuf += 2;
  662. return;
  663. }
  664. // Convert to UTF16.
  665. UcnVal -= 0x10000;
  666. *ResultPtr = 0xD800 + (UcnVal >> 10);
  667. *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
  668. ResultBuf += 4;
  669. return;
  670. }
  671. assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
  672. // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
  673. // The conversion below was inspired by:
  674. // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
  675. // First, we determine how many bytes the result will require.
  676. typedef uint8_t UTF8;
  677. unsigned short bytesToWrite = 0;
  678. if (UcnVal < (UTF32)0x80)
  679. bytesToWrite = 1;
  680. else if (UcnVal < (UTF32)0x800)
  681. bytesToWrite = 2;
  682. else if (UcnVal < (UTF32)0x10000)
  683. bytesToWrite = 3;
  684. else
  685. bytesToWrite = 4;
  686. const unsigned byteMask = 0xBF;
  687. const unsigned byteMark = 0x80;
  688. // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
  689. // into the first byte, depending on how many bytes follow.
  690. static const UTF8 firstByteMark[5] = {
  691. 0x00, 0x00, 0xC0, 0xE0, 0xF0
  692. };
  693. // Finally, we write the bytes into ResultBuf.
  694. ResultBuf += bytesToWrite;
  695. switch (bytesToWrite) { // note: everything falls through.
  696. case 4:
  697. *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
  698. [[fallthrough]];
  699. case 3:
  700. *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
  701. [[fallthrough]];
  702. case 2:
  703. *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
  704. [[fallthrough]];
  705. case 1:
  706. *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
  707. }
  708. // Update the buffer.
  709. ResultBuf += bytesToWrite;
  710. }
  711. /// integer-constant: [C99 6.4.4.1]
  712. /// decimal-constant integer-suffix
  713. /// octal-constant integer-suffix
  714. /// hexadecimal-constant integer-suffix
  715. /// binary-literal integer-suffix [GNU, C++1y]
  716. /// user-defined-integer-literal: [C++11 lex.ext]
  717. /// decimal-literal ud-suffix
  718. /// octal-literal ud-suffix
  719. /// hexadecimal-literal ud-suffix
  720. /// binary-literal ud-suffix [GNU, C++1y]
  721. /// decimal-constant:
  722. /// nonzero-digit
  723. /// decimal-constant digit
  724. /// octal-constant:
  725. /// 0
  726. /// octal-constant octal-digit
  727. /// hexadecimal-constant:
  728. /// hexadecimal-prefix hexadecimal-digit
  729. /// hexadecimal-constant hexadecimal-digit
  730. /// hexadecimal-prefix: one of
  731. /// 0x 0X
  732. /// binary-literal:
  733. /// 0b binary-digit
  734. /// 0B binary-digit
  735. /// binary-literal binary-digit
  736. /// integer-suffix:
  737. /// unsigned-suffix [long-suffix]
  738. /// unsigned-suffix [long-long-suffix]
  739. /// long-suffix [unsigned-suffix]
  740. /// long-long-suffix [unsigned-suffix]
  741. /// nonzero-digit:
  742. /// 1 2 3 4 5 6 7 8 9
  743. /// octal-digit:
  744. /// 0 1 2 3 4 5 6 7
  745. /// hexadecimal-digit:
  746. /// 0 1 2 3 4 5 6 7 8 9
  747. /// a b c d e f
  748. /// A B C D E F
  749. /// binary-digit:
  750. /// 0
  751. /// 1
  752. /// unsigned-suffix: one of
  753. /// u U
  754. /// long-suffix: one of
  755. /// l L
  756. /// long-long-suffix: one of
  757. /// ll LL
  758. ///
  759. /// floating-constant: [C99 6.4.4.2]
  760. /// TODO: add rules...
  761. ///
  762. NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
  763. SourceLocation TokLoc,
  764. const SourceManager &SM,
  765. const LangOptions &LangOpts,
  766. const TargetInfo &Target,
  767. DiagnosticsEngine &Diags)
  768. : SM(SM), LangOpts(LangOpts), Diags(Diags),
  769. ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {
  770. s = DigitsBegin = ThisTokBegin;
  771. saw_exponent = false;
  772. saw_period = false;
  773. saw_ud_suffix = false;
  774. saw_fixed_point_suffix = false;
  775. isLong = false;
  776. isUnsigned = false;
  777. isLongLong = false;
  778. isSizeT = false;
  779. isHalf = false;
  780. isFloat = false;
  781. isImaginary = false;
  782. isFloat16 = false;
  783. isFloat128 = false;
  784. MicrosoftInteger = 0;
  785. isFract = false;
  786. isAccum = false;
  787. hadError = false;
  788. isBitInt = false;
  789. // This routine assumes that the range begin/end matches the regex for integer
  790. // and FP constants (specifically, the 'pp-number' regex), and assumes that
  791. // the byte at "*end" is both valid and not part of the regex. Because of
  792. // this, it doesn't have to check for 'overscan' in various places.
  793. if (isPreprocessingNumberBody(*ThisTokEnd)) {
  794. Diags.Report(TokLoc, diag::err_lexing_numeric);
  795. hadError = true;
  796. return;
  797. }
  798. if (*s == '0') { // parse radix
  799. ParseNumberStartingWithZero(TokLoc);
  800. if (hadError)
  801. return;
  802. } else { // the first digit is non-zero
  803. radix = 10;
  804. s = SkipDigits(s);
  805. if (s == ThisTokEnd) {
  806. // Done.
  807. } else {
  808. ParseDecimalOrOctalCommon(TokLoc);
  809. if (hadError)
  810. return;
  811. }
  812. }
  813. SuffixBegin = s;
  814. checkSeparator(TokLoc, s, CSK_AfterDigits);
  815. // Initial scan to lookahead for fixed point suffix.
  816. if (LangOpts.FixedPoint) {
  817. for (const char *c = s; c != ThisTokEnd; ++c) {
  818. if (*c == 'r' || *c == 'k' || *c == 'R' || *c == 'K') {
  819. saw_fixed_point_suffix = true;
  820. break;
  821. }
  822. }
  823. }
  824. // Parse the suffix. At this point we can classify whether we have an FP or
  825. // integer constant.
  826. bool isFixedPointConstant = isFixedPointLiteral();
  827. bool isFPConstant = isFloatingLiteral();
  828. bool HasSize = false;
  829. // Loop over all of the characters of the suffix. If we see something bad,
  830. // we break out of the loop.
  831. for (; s != ThisTokEnd; ++s) {
  832. switch (*s) {
  833. case 'R':
  834. case 'r':
  835. if (!LangOpts.FixedPoint)
  836. break;
  837. if (isFract || isAccum) break;
  838. if (!(saw_period || saw_exponent)) break;
  839. isFract = true;
  840. continue;
  841. case 'K':
  842. case 'k':
  843. if (!LangOpts.FixedPoint)
  844. break;
  845. if (isFract || isAccum) break;
  846. if (!(saw_period || saw_exponent)) break;
  847. isAccum = true;
  848. continue;
  849. case 'h': // FP Suffix for "half".
  850. case 'H':
  851. // OpenCL Extension v1.2 s9.5 - h or H suffix for half type.
  852. if (!(LangOpts.Half || LangOpts.FixedPoint))
  853. break;
  854. if (isIntegerLiteral()) break; // Error for integer constant.
  855. if (HasSize)
  856. break;
  857. HasSize = true;
  858. isHalf = true;
  859. continue; // Success.
  860. case 'f': // FP Suffix for "float"
  861. case 'F':
  862. if (!isFPConstant) break; // Error for integer constant.
  863. if (HasSize)
  864. break;
  865. HasSize = true;
  866. // CUDA host and device may have different _Float16 support, therefore
  867. // allows f16 literals to avoid false alarm.
  868. // When we compile for OpenMP target offloading on NVPTX, f16 suffix
  869. // should also be supported.
  870. // ToDo: more precise check for CUDA.
  871. // TODO: AMDGPU might also support it in the future.
  872. if ((Target.hasFloat16Type() || LangOpts.CUDA ||
  873. (LangOpts.OpenMPIsDevice && Target.getTriple().isNVPTX())) &&
  874. s + 2 < ThisTokEnd && s[1] == '1' && s[2] == '6') {
  875. s += 2; // success, eat up 2 characters.
  876. isFloat16 = true;
  877. continue;
  878. }
  879. isFloat = true;
  880. continue; // Success.
  881. case 'q': // FP Suffix for "__float128"
  882. case 'Q':
  883. if (!isFPConstant) break; // Error for integer constant.
  884. if (HasSize)
  885. break;
  886. HasSize = true;
  887. isFloat128 = true;
  888. continue; // Success.
  889. case 'u':
  890. case 'U':
  891. if (isFPConstant) break; // Error for floating constant.
  892. if (isUnsigned) break; // Cannot be repeated.
  893. isUnsigned = true;
  894. continue; // Success.
  895. case 'l':
  896. case 'L':
  897. if (HasSize)
  898. break;
  899. HasSize = true;
  900. // Check for long long. The L's need to be adjacent and the same case.
  901. if (s[1] == s[0]) {
  902. assert(s + 1 < ThisTokEnd && "didn't maximally munch?");
  903. if (isFPConstant) break; // long long invalid for floats.
  904. isLongLong = true;
  905. ++s; // Eat both of them.
  906. } else {
  907. isLong = true;
  908. }
  909. continue; // Success.
  910. case 'z':
  911. case 'Z':
  912. if (isFPConstant)
  913. break; // Invalid for floats.
  914. if (HasSize)
  915. break;
  916. HasSize = true;
  917. isSizeT = true;
  918. continue;
  919. case 'i':
  920. case 'I':
  921. if (LangOpts.MicrosoftExt && !isFPConstant) {
  922. // Allow i8, i16, i32, and i64. First, look ahead and check if
  923. // suffixes are Microsoft integers and not the imaginary unit.
  924. uint8_t Bits = 0;
  925. size_t ToSkip = 0;
  926. switch (s[1]) {
  927. case '8': // i8 suffix
  928. Bits = 8;
  929. ToSkip = 2;
  930. break;
  931. case '1':
  932. if (s[2] == '6') { // i16 suffix
  933. Bits = 16;
  934. ToSkip = 3;
  935. }
  936. break;
  937. case '3':
  938. if (s[2] == '2') { // i32 suffix
  939. Bits = 32;
  940. ToSkip = 3;
  941. }
  942. break;
  943. case '6':
  944. if (s[2] == '4') { // i64 suffix
  945. Bits = 64;
  946. ToSkip = 3;
  947. }
  948. break;
  949. default:
  950. break;
  951. }
  952. if (Bits) {
  953. if (HasSize)
  954. break;
  955. HasSize = true;
  956. MicrosoftInteger = Bits;
  957. s += ToSkip;
  958. assert(s <= ThisTokEnd && "didn't maximally munch?");
  959. break;
  960. }
  961. }
  962. [[fallthrough]];
  963. case 'j':
  964. case 'J':
  965. if (isImaginary) break; // Cannot be repeated.
  966. isImaginary = true;
  967. continue; // Success.
  968. case 'w':
  969. case 'W':
  970. if (isFPConstant)
  971. break; // Invalid for floats.
  972. if (HasSize)
  973. break; // Invalid if we already have a size for the literal.
  974. // wb and WB are allowed, but a mixture of cases like Wb or wB is not. We
  975. // explicitly do not support the suffix in C++ as an extension because a
  976. // library-based UDL that resolves to a library type may be more
  977. // appropriate there.
  978. if (!LangOpts.CPlusPlus && ((s[0] == 'w' && s[1] == 'b') ||
  979. (s[0] == 'W' && s[1] == 'B'))) {
  980. isBitInt = true;
  981. HasSize = true;
  982. ++s; // Skip both characters (2nd char skipped on continue).
  983. continue; // Success.
  984. }
  985. }
  986. // If we reached here, there was an error or a ud-suffix.
  987. break;
  988. }
  989. // "i", "if", and "il" are user-defined suffixes in C++1y.
  990. if (s != ThisTokEnd || isImaginary) {
  991. // FIXME: Don't bother expanding UCNs if !tok.hasUCN().
  992. expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
  993. if (isValidUDSuffix(LangOpts, UDSuffixBuf)) {
  994. if (!isImaginary) {
  995. // Any suffix pieces we might have parsed are actually part of the
  996. // ud-suffix.
  997. isLong = false;
  998. isUnsigned = false;
  999. isLongLong = false;
  1000. isSizeT = false;
  1001. isFloat = false;
  1002. isFloat16 = false;
  1003. isHalf = false;
  1004. isImaginary = false;
  1005. isBitInt = false;
  1006. MicrosoftInteger = 0;
  1007. saw_fixed_point_suffix = false;
  1008. isFract = false;
  1009. isAccum = false;
  1010. }
  1011. saw_ud_suffix = true;
  1012. return;
  1013. }
  1014. if (s != ThisTokEnd) {
  1015. // Report an error if there are any.
  1016. Diags.Report(Lexer::AdvanceToTokenCharacter(
  1017. TokLoc, SuffixBegin - ThisTokBegin, SM, LangOpts),
  1018. diag::err_invalid_suffix_constant)
  1019. << StringRef(SuffixBegin, ThisTokEnd - SuffixBegin)
  1020. << (isFixedPointConstant ? 2 : isFPConstant);
  1021. hadError = true;
  1022. }
  1023. }
  1024. if (!hadError && saw_fixed_point_suffix) {
  1025. assert(isFract || isAccum);
  1026. }
  1027. }
  1028. /// ParseDecimalOrOctalCommon - This method is called for decimal or octal
  1029. /// numbers. It issues an error for illegal digits, and handles floating point
  1030. /// parsing. If it detects a floating point number, the radix is set to 10.
  1031. void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){
  1032. assert((radix == 8 || radix == 10) && "Unexpected radix");
  1033. // If we have a hex digit other than 'e' (which denotes a FP exponent) then
  1034. // the code is using an incorrect base.
  1035. if (isHexDigit(*s) && *s != 'e' && *s != 'E' &&
  1036. !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
  1037. Diags.Report(
  1038. Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM, LangOpts),
  1039. diag::err_invalid_digit)
  1040. << StringRef(s, 1) << (radix == 8 ? 1 : 0);
  1041. hadError = true;
  1042. return;
  1043. }
  1044. if (*s == '.') {
  1045. checkSeparator(TokLoc, s, CSK_AfterDigits);
  1046. s++;
  1047. radix = 10;
  1048. saw_period = true;
  1049. checkSeparator(TokLoc, s, CSK_BeforeDigits);
  1050. s = SkipDigits(s); // Skip suffix.
  1051. }
  1052. if (*s == 'e' || *s == 'E') { // exponent
  1053. checkSeparator(TokLoc, s, CSK_AfterDigits);
  1054. const char *Exponent = s;
  1055. s++;
  1056. radix = 10;
  1057. saw_exponent = true;
  1058. if (s != ThisTokEnd && (*s == '+' || *s == '-')) s++; // sign
  1059. const char *first_non_digit = SkipDigits(s);
  1060. if (containsDigits(s, first_non_digit)) {
  1061. checkSeparator(TokLoc, s, CSK_BeforeDigits);
  1062. s = first_non_digit;
  1063. } else {
  1064. if (!hadError) {
  1065. Diags.Report(Lexer::AdvanceToTokenCharacter(
  1066. TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
  1067. diag::err_exponent_has_no_digits);
  1068. hadError = true;
  1069. }
  1070. return;
  1071. }
  1072. }
  1073. }
  1074. /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
  1075. /// suffixes as ud-suffixes, because the diagnostic experience is better if we
  1076. /// treat it as an invalid suffix.
  1077. bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
  1078. StringRef Suffix) {
  1079. if (!LangOpts.CPlusPlus11 || Suffix.empty())
  1080. return false;
  1081. // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
  1082. if (Suffix[0] == '_')
  1083. return true;
  1084. // In C++11, there are no library suffixes.
  1085. if (!LangOpts.CPlusPlus14)
  1086. return false;
  1087. // In C++14, "s", "h", "min", "ms", "us", and "ns" are used in the library.
  1088. // Per tweaked N3660, "il", "i", and "if" are also used in the library.
  1089. // In C++2a "d" and "y" are used in the library.
  1090. return llvm::StringSwitch<bool>(Suffix)
  1091. .Cases("h", "min", "s", true)
  1092. .Cases("ms", "us", "ns", true)
  1093. .Cases("il", "i", "if", true)
  1094. .Cases("d", "y", LangOpts.CPlusPlus20)
  1095. .Default(false);
  1096. }
  1097. void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
  1098. const char *Pos,
  1099. CheckSeparatorKind IsAfterDigits) {
  1100. if (IsAfterDigits == CSK_AfterDigits) {
  1101. if (Pos == ThisTokBegin)
  1102. return;
  1103. --Pos;
  1104. } else if (Pos == ThisTokEnd)
  1105. return;
  1106. if (isDigitSeparator(*Pos)) {
  1107. Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin, SM,
  1108. LangOpts),
  1109. diag::err_digit_separator_not_between_digits)
  1110. << IsAfterDigits;
  1111. hadError = true;
  1112. }
  1113. }
  1114. /// ParseNumberStartingWithZero - This method is called when the first character
  1115. /// of the number is found to be a zero. This means it is either an octal
  1116. /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
  1117. /// a floating point number (01239.123e4). Eat the prefix, determining the
  1118. /// radix etc.
  1119. void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
  1120. assert(s[0] == '0' && "Invalid method call");
  1121. s++;
  1122. int c1 = s[0];
  1123. // Handle a hex number like 0x1234.
  1124. if ((c1 == 'x' || c1 == 'X') && (isHexDigit(s[1]) || s[1] == '.')) {
  1125. s++;
  1126. assert(s < ThisTokEnd && "didn't maximally munch?");
  1127. radix = 16;
  1128. DigitsBegin = s;
  1129. s = SkipHexDigits(s);
  1130. bool HasSignificandDigits = containsDigits(DigitsBegin, s);
  1131. if (s == ThisTokEnd) {
  1132. // Done.
  1133. } else if (*s == '.') {
  1134. s++;
  1135. saw_period = true;
  1136. const char *floatDigitsBegin = s;
  1137. s = SkipHexDigits(s);
  1138. if (containsDigits(floatDigitsBegin, s))
  1139. HasSignificandDigits = true;
  1140. if (HasSignificandDigits)
  1141. checkSeparator(TokLoc, floatDigitsBegin, CSK_BeforeDigits);
  1142. }
  1143. if (!HasSignificandDigits) {
  1144. Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
  1145. LangOpts),
  1146. diag::err_hex_constant_requires)
  1147. << LangOpts.CPlusPlus << 1;
  1148. hadError = true;
  1149. return;
  1150. }
  1151. // A binary exponent can appear with or with a '.'. If dotted, the
  1152. // binary exponent is required.
  1153. if (*s == 'p' || *s == 'P') {
  1154. checkSeparator(TokLoc, s, CSK_AfterDigits);
  1155. const char *Exponent = s;
  1156. s++;
  1157. saw_exponent = true;
  1158. if (s != ThisTokEnd && (*s == '+' || *s == '-')) s++; // sign
  1159. const char *first_non_digit = SkipDigits(s);
  1160. if (!containsDigits(s, first_non_digit)) {
  1161. if (!hadError) {
  1162. Diags.Report(Lexer::AdvanceToTokenCharacter(
  1163. TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
  1164. diag::err_exponent_has_no_digits);
  1165. hadError = true;
  1166. }
  1167. return;
  1168. }
  1169. checkSeparator(TokLoc, s, CSK_BeforeDigits);
  1170. s = first_non_digit;
  1171. if (!LangOpts.HexFloats)
  1172. Diags.Report(TokLoc, LangOpts.CPlusPlus
  1173. ? diag::ext_hex_literal_invalid
  1174. : diag::ext_hex_constant_invalid);
  1175. else if (LangOpts.CPlusPlus17)
  1176. Diags.Report(TokLoc, diag::warn_cxx17_hex_literal);
  1177. } else if (saw_period) {
  1178. Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
  1179. LangOpts),
  1180. diag::err_hex_constant_requires)
  1181. << LangOpts.CPlusPlus << 0;
  1182. hadError = true;
  1183. }
  1184. return;
  1185. }
  1186. // Handle simple binary numbers 0b01010
  1187. if ((c1 == 'b' || c1 == 'B') && (s[1] == '0' || s[1] == '1')) {
  1188. // 0b101010 is a C++1y / GCC extension.
  1189. Diags.Report(TokLoc, LangOpts.CPlusPlus14
  1190. ? diag::warn_cxx11_compat_binary_literal
  1191. : LangOpts.CPlusPlus ? diag::ext_binary_literal_cxx14
  1192. : diag::ext_binary_literal);
  1193. ++s;
  1194. assert(s < ThisTokEnd && "didn't maximally munch?");
  1195. radix = 2;
  1196. DigitsBegin = s;
  1197. s = SkipBinaryDigits(s);
  1198. if (s == ThisTokEnd) {
  1199. // Done.
  1200. } else if (isHexDigit(*s) &&
  1201. !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
  1202. Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
  1203. LangOpts),
  1204. diag::err_invalid_digit)
  1205. << StringRef(s, 1) << 2;
  1206. hadError = true;
  1207. }
  1208. // Other suffixes will be diagnosed by the caller.
  1209. return;
  1210. }
  1211. // For now, the radix is set to 8. If we discover that we have a
  1212. // floating point constant, the radix will change to 10. Octal floating
  1213. // point constants are not permitted (only decimal and hexadecimal).
  1214. radix = 8;
  1215. const char *PossibleNewDigitStart = s;
  1216. s = SkipOctalDigits(s);
  1217. // When the value is 0 followed by a suffix (like 0wb), we want to leave 0
  1218. // as the start of the digits. So if skipping octal digits does not skip
  1219. // anything, we leave the digit start where it was.
  1220. if (s != PossibleNewDigitStart)
  1221. DigitsBegin = PossibleNewDigitStart;
  1222. if (s == ThisTokEnd)
  1223. return; // Done, simple octal number like 01234
  1224. // If we have some other non-octal digit that *is* a decimal digit, see if
  1225. // this is part of a floating point number like 094.123 or 09e1.
  1226. if (isDigit(*s)) {
  1227. const char *EndDecimal = SkipDigits(s);
  1228. if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
  1229. s = EndDecimal;
  1230. radix = 10;
  1231. }
  1232. }
  1233. ParseDecimalOrOctalCommon(TokLoc);
  1234. }
  1235. static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
  1236. switch (Radix) {
  1237. case 2:
  1238. return NumDigits <= 64;
  1239. case 8:
  1240. return NumDigits <= 64 / 3; // Digits are groups of 3 bits.
  1241. case 10:
  1242. return NumDigits <= 19; // floor(log10(2^64))
  1243. case 16:
  1244. return NumDigits <= 64 / 4; // Digits are groups of 4 bits.
  1245. default:
  1246. llvm_unreachable("impossible Radix");
  1247. }
  1248. }
  1249. /// GetIntegerValue - Convert this numeric literal value to an APInt that
  1250. /// matches Val's input width. If there is an overflow, set Val to the low bits
  1251. /// of the result and return true. Otherwise, return false.
  1252. bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
  1253. // Fast path: Compute a conservative bound on the maximum number of
  1254. // bits per digit in this radix. If we can't possibly overflow a
  1255. // uint64 based on that bound then do the simple conversion to
  1256. // integer. This avoids the expensive overflow checking below, and
  1257. // handles the common cases that matter (small decimal integers and
  1258. // hex/octal values which don't overflow).
  1259. const unsigned NumDigits = SuffixBegin - DigitsBegin;
  1260. if (alwaysFitsInto64Bits(radix, NumDigits)) {
  1261. uint64_t N = 0;
  1262. for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr)
  1263. if (!isDigitSeparator(*Ptr))
  1264. N = N * radix + llvm::hexDigitValue(*Ptr);
  1265. // This will truncate the value to Val's input width. Simply check
  1266. // for overflow by comparing.
  1267. Val = N;
  1268. return Val.getZExtValue() != N;
  1269. }
  1270. Val = 0;
  1271. const char *Ptr = DigitsBegin;
  1272. llvm::APInt RadixVal(Val.getBitWidth(), radix);
  1273. llvm::APInt CharVal(Val.getBitWidth(), 0);
  1274. llvm::APInt OldVal = Val;
  1275. bool OverflowOccurred = false;
  1276. while (Ptr < SuffixBegin) {
  1277. if (isDigitSeparator(*Ptr)) {
  1278. ++Ptr;
  1279. continue;
  1280. }
  1281. unsigned C = llvm::hexDigitValue(*Ptr++);
  1282. // If this letter is out of bound for this radix, reject it.
  1283. assert(C < radix && "NumericLiteralParser ctor should have rejected this");
  1284. CharVal = C;
  1285. // Add the digit to the value in the appropriate radix. If adding in digits
  1286. // made the value smaller, then this overflowed.
  1287. OldVal = Val;
  1288. // Multiply by radix, did overflow occur on the multiply?
  1289. Val *= RadixVal;
  1290. OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
  1291. // Add value, did overflow occur on the value?
  1292. // (a + b) ult b <=> overflow
  1293. Val += CharVal;
  1294. OverflowOccurred |= Val.ult(CharVal);
  1295. }
  1296. return OverflowOccurred;
  1297. }
  1298. llvm::APFloat::opStatus
  1299. NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
  1300. using llvm::APFloat;
  1301. unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
  1302. llvm::SmallString<16> Buffer;
  1303. StringRef Str(ThisTokBegin, n);
  1304. if (Str.contains('\'')) {
  1305. Buffer.reserve(n);
  1306. std::remove_copy_if(Str.begin(), Str.end(), std::back_inserter(Buffer),
  1307. &isDigitSeparator);
  1308. Str = Buffer;
  1309. }
  1310. auto StatusOrErr =
  1311. Result.convertFromString(Str, APFloat::rmNearestTiesToEven);
  1312. assert(StatusOrErr && "Invalid floating point representation");
  1313. return !errorToBool(StatusOrErr.takeError()) ? *StatusOrErr
  1314. : APFloat::opInvalidOp;
  1315. }
  /// Return true if \p c is an exponent marker: 'e'/'E' or 'p'/'P'. The radix
  /// of the literal is not taken into account.
  static inline bool IsExponentPart(char c) {
    switch (c) {
    case 'p':
    case 'P':
    case 'e':
    case 'E':
      return true;
    default:
      return false;
    }
  }
  1319. bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) {
  1320. assert(radix == 16 || radix == 10);
  1321. // Find how many digits are needed to store the whole literal.
  1322. unsigned NumDigits = SuffixBegin - DigitsBegin;
  1323. if (saw_period) --NumDigits;
  1324. // Initial scan of the exponent if it exists
  1325. bool ExpOverflowOccurred = false;
  1326. bool NegativeExponent = false;
  1327. const char *ExponentBegin;
  1328. uint64_t Exponent = 0;
  1329. int64_t BaseShift = 0;
  1330. if (saw_exponent) {
  1331. const char *Ptr = DigitsBegin;
  1332. while (!IsExponentPart(*Ptr)) ++Ptr;
  1333. ExponentBegin = Ptr;
  1334. ++Ptr;
  1335. NegativeExponent = *Ptr == '-';
  1336. if (NegativeExponent) ++Ptr;
  1337. unsigned NumExpDigits = SuffixBegin - Ptr;
  1338. if (alwaysFitsInto64Bits(radix, NumExpDigits)) {
  1339. llvm::StringRef ExpStr(Ptr, NumExpDigits);
  1340. llvm::APInt ExpInt(/*numBits=*/64, ExpStr, /*radix=*/10);
  1341. Exponent = ExpInt.getZExtValue();
  1342. } else {
  1343. ExpOverflowOccurred = true;
  1344. }
  1345. if (NegativeExponent) BaseShift -= Exponent;
  1346. else BaseShift += Exponent;
  1347. }
  1348. // Number of bits needed for decimal literal is
  1349. // ceil(NumDigits * log2(10)) Integral part
  1350. // + Scale Fractional part
  1351. // + ceil(Exponent * log2(10)) Exponent
  1352. // --------------------------------------------------
  1353. // ceil((NumDigits + Exponent) * log2(10)) + Scale
  1354. //
  1355. // But for simplicity in handling integers, we can round up log2(10) to 4,
  1356. // making:
  1357. // 4 * (NumDigits + Exponent) + Scale
  1358. //
  1359. // Number of digits needed for hexadecimal literal is
  1360. // 4 * NumDigits Integral part
  1361. // + Scale Fractional part
  1362. // + Exponent Exponent
  1363. // --------------------------------------------------
  1364. // (4 * NumDigits) + Scale + Exponent
  1365. uint64_t NumBitsNeeded;
  1366. if (radix == 10)
  1367. NumBitsNeeded = 4 * (NumDigits + Exponent) + Scale;
  1368. else
  1369. NumBitsNeeded = 4 * NumDigits + Exponent + Scale;
  1370. if (NumBitsNeeded > std::numeric_limits<unsigned>::max())
  1371. ExpOverflowOccurred = true;
  1372. llvm::APInt Val(static_cast<unsigned>(NumBitsNeeded), 0, /*isSigned=*/false);
  1373. bool FoundDecimal = false;
  1374. int64_t FractBaseShift = 0;
  1375. const char *End = saw_exponent ? ExponentBegin : SuffixBegin;
  1376. for (const char *Ptr = DigitsBegin; Ptr < End; ++Ptr) {
  1377. if (*Ptr == '.') {
  1378. FoundDecimal = true;
  1379. continue;
  1380. }
  1381. // Normal reading of an integer
  1382. unsigned C = llvm::hexDigitValue(*Ptr);
  1383. assert(C < radix && "NumericLiteralParser ctor should have rejected this");
  1384. Val *= radix;
  1385. Val += C;
  1386. if (FoundDecimal)
  1387. // Keep track of how much we will need to adjust this value by from the
  1388. // number of digits past the radix point.
  1389. --FractBaseShift;
  1390. }
  1391. // For a radix of 16, we will be multiplying by 2 instead of 16.
  1392. if (radix == 16) FractBaseShift *= 4;
  1393. BaseShift += FractBaseShift;
  1394. Val <<= Scale;
  1395. uint64_t Base = (radix == 16) ? 2 : 10;
  1396. if (BaseShift > 0) {
  1397. for (int64_t i = 0; i < BaseShift; ++i) {
  1398. Val *= Base;
  1399. }
  1400. } else if (BaseShift < 0) {
  1401. for (int64_t i = BaseShift; i < 0 && !Val.isZero(); ++i)
  1402. Val = Val.udiv(Base);
  1403. }
  1404. bool IntOverflowOccurred = false;
  1405. auto MaxVal = llvm::APInt::getMaxValue(StoreVal.getBitWidth());
  1406. if (Val.getBitWidth() > StoreVal.getBitWidth()) {
  1407. IntOverflowOccurred |= Val.ugt(MaxVal.zext(Val.getBitWidth()));
  1408. StoreVal = Val.trunc(StoreVal.getBitWidth());
  1409. } else if (Val.getBitWidth() < StoreVal.getBitWidth()) {
  1410. IntOverflowOccurred |= Val.zext(MaxVal.getBitWidth()).ugt(MaxVal);
  1411. StoreVal = Val.zext(StoreVal.getBitWidth());
  1412. } else {
  1413. StoreVal = Val;
  1414. }
  1415. return IntOverflowOccurred || ExpOverflowOccurred;
  1416. }
  1417. /// \verbatim
  1418. /// user-defined-character-literal: [C++11 lex.ext]
  1419. /// character-literal ud-suffix
  1420. /// ud-suffix:
  1421. /// identifier
  1422. /// character-literal: [C++11 lex.ccon]
  1423. /// ' c-char-sequence '
  1424. /// u' c-char-sequence '
  1425. /// U' c-char-sequence '
  1426. /// L' c-char-sequence '
  1427. /// u8' c-char-sequence ' [C++1z lex.ccon]
  1428. /// c-char-sequence:
  1429. /// c-char
  1430. /// c-char-sequence c-char
  1431. /// c-char:
  1432. /// any member of the source character set except the single-quote ',
  1433. /// backslash \, or new-line character
  1434. /// escape-sequence
  1435. /// universal-character-name
  1436. /// escape-sequence:
  1437. /// simple-escape-sequence
  1438. /// octal-escape-sequence
  1439. /// hexadecimal-escape-sequence
  1440. /// simple-escape-sequence:
  1441. /// one of \' \" \? \\ \a \b \f \n \r \t \v
  1442. /// octal-escape-sequence:
  1443. /// \ octal-digit
  1444. /// \ octal-digit octal-digit
  1445. /// \ octal-digit octal-digit octal-digit
  1446. /// hexadecimal-escape-sequence:
  1447. /// \x hexadecimal-digit
  1448. /// hexadecimal-escape-sequence hexadecimal-digit
  1449. /// universal-character-name: [C++11 lex.charset]
  1450. /// \u hex-quad
  1451. /// \U hex-quad hex-quad
  1452. /// hex-quad:
  1453. /// hex-digit hex-digit hex-digit hex-digit
  1454. /// \endverbatim
  1455. ///
  1456. CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
  1457. SourceLocation Loc, Preprocessor &PP,
  1458. tok::TokenKind kind) {
  1459. // At this point we know that the character matches the regex "(L|u|U)?'.*'".
  1460. HadError = false;
  1461. Kind = kind;
  1462. const char *TokBegin = begin;
  1463. // Skip over wide character determinant.
  1464. if (Kind != tok::char_constant)
  1465. ++begin;
  1466. if (Kind == tok::utf8_char_constant)
  1467. ++begin;
  1468. // Skip over the entry quote.
  1469. if (begin[0] != '\'') {
  1470. PP.Diag(Loc, diag::err_lexing_char);
  1471. HadError = true;
  1472. return;
  1473. }
  1474. ++begin;
  1475. // Remove an optional ud-suffix.
  1476. if (end[-1] != '\'') {
  1477. const char *UDSuffixEnd = end;
  1478. do {
  1479. --end;
  1480. } while (end[-1] != '\'');
  1481. // FIXME: Don't bother with this if !tok.hasUCN().
  1482. expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
  1483. UDSuffixOffset = end - TokBegin;
  1484. }
  1485. // Trim the ending quote.
  1486. assert(end != begin && "Invalid token lexed");
  1487. --end;
  1488. // FIXME: The "Value" is an uint64_t so we can handle char literals of
  1489. // up to 64-bits.
  1490. // FIXME: This extensively assumes that 'char' is 8-bits.
  1491. assert(PP.getTargetInfo().getCharWidth() == 8 &&
  1492. "Assumes char is 8 bits");
  1493. assert(PP.getTargetInfo().getIntWidth() <= 64 &&
  1494. (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
  1495. "Assumes sizeof(int) on target is <= 64 and a multiple of char");
  1496. assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
  1497. "Assumes sizeof(wchar) on target is <= 64");
  1498. SmallVector<uint32_t, 4> codepoint_buffer;
  1499. codepoint_buffer.resize(end - begin);
  1500. uint32_t *buffer_begin = &codepoint_buffer.front();
  1501. uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
  1502. // Unicode escapes representing characters that cannot be correctly
  1503. // represented in a single code unit are disallowed in character literals
  1504. // by this implementation.
  1505. uint32_t largest_character_for_kind;
  1506. if (tok::wide_char_constant == Kind) {
  1507. largest_character_for_kind =
  1508. 0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
  1509. } else if (tok::utf8_char_constant == Kind) {
  1510. largest_character_for_kind = 0x7F;
  1511. } else if (tok::utf16_char_constant == Kind) {
  1512. largest_character_for_kind = 0xFFFF;
  1513. } else if (tok::utf32_char_constant == Kind) {
  1514. largest_character_for_kind = 0x10FFFF;
  1515. } else {
  1516. largest_character_for_kind = 0x7Fu;
  1517. }
  1518. while (begin != end) {
  1519. // Is this a span of non-escape characters?
  1520. if (begin[0] != '\\') {
  1521. char const *start = begin;
  1522. do {
  1523. ++begin;
  1524. } while (begin != end && *begin != '\\');
  1525. char const *tmp_in_start = start;
  1526. uint32_t *tmp_out_start = buffer_begin;
  1527. llvm::ConversionResult res =
  1528. llvm::ConvertUTF8toUTF32(reinterpret_cast<llvm::UTF8 const **>(&start),
  1529. reinterpret_cast<llvm::UTF8 const *>(begin),
  1530. &buffer_begin, buffer_end, llvm::strictConversion);
  1531. if (res != llvm::conversionOK) {
  1532. // If we see bad encoding for unprefixed character literals, warn and
  1533. // simply copy the byte values, for compatibility with gcc and
  1534. // older versions of clang.
  1535. bool NoErrorOnBadEncoding = isOrdinary();
  1536. unsigned Msg = diag::err_bad_character_encoding;
  1537. if (NoErrorOnBadEncoding)
  1538. Msg = diag::warn_bad_character_encoding;
  1539. PP.Diag(Loc, Msg);
  1540. if (NoErrorOnBadEncoding) {
  1541. start = tmp_in_start;
  1542. buffer_begin = tmp_out_start;
  1543. for (; start != begin; ++start, ++buffer_begin)
  1544. *buffer_begin = static_cast<uint8_t>(*start);
  1545. } else {
  1546. HadError = true;
  1547. }
  1548. } else {
  1549. for (; tmp_out_start < buffer_begin; ++tmp_out_start) {
  1550. if (*tmp_out_start > largest_character_for_kind) {
  1551. HadError = true;
  1552. PP.Diag(Loc, diag::err_character_too_large);
  1553. }
  1554. }
  1555. }
  1556. continue;
  1557. }
  1558. // Is this a Universal Character Name escape?
  1559. if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') {
  1560. unsigned short UcnLen = 0;
  1561. if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
  1562. FullSourceLoc(Loc, PP.getSourceManager()),
  1563. &PP.getDiagnostics(), PP.getLangOpts(), true)) {
  1564. HadError = true;
  1565. } else if (*buffer_begin > largest_character_for_kind) {
  1566. HadError = true;
  1567. PP.Diag(Loc, diag::err_character_too_large);
  1568. }
  1569. ++buffer_begin;
  1570. continue;
  1571. }
  1572. unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
  1573. uint64_t result =
  1574. ProcessCharEscape(TokBegin, begin, end, HadError,
  1575. FullSourceLoc(Loc,PP.getSourceManager()),
  1576. CharWidth, &PP.getDiagnostics(), PP.getLangOpts());
  1577. *buffer_begin++ = result;
  1578. }
  1579. unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front();
  1580. if (NumCharsSoFar > 1) {
  1581. if (isOrdinary() && NumCharsSoFar == 4)
  1582. PP.Diag(Loc, diag::warn_four_char_character_literal);
  1583. else if (isOrdinary())
  1584. PP.Diag(Loc, diag::warn_multichar_character_literal);
  1585. else {
  1586. PP.Diag(Loc, diag::err_multichar_character_literal) << (isWide() ? 0 : 1);
  1587. HadError = true;
  1588. }
  1589. IsMultiChar = true;
  1590. } else {
  1591. IsMultiChar = false;
  1592. }
  1593. llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
  1594. // Narrow character literals act as though their value is concatenated
  1595. // in this implementation, but warn on overflow.
  1596. bool multi_char_too_long = false;
  1597. if (isOrdinary() && isMultiChar()) {
  1598. LitVal = 0;
  1599. for (size_t i = 0; i < NumCharsSoFar; ++i) {
  1600. // check for enough leading zeros to shift into
  1601. multi_char_too_long |= (LitVal.countLeadingZeros() < 8);
  1602. LitVal <<= 8;
  1603. LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
  1604. }
  1605. } else if (NumCharsSoFar > 0) {
  1606. // otherwise just take the last character
  1607. LitVal = buffer_begin[-1];
  1608. }
  1609. if (!HadError && multi_char_too_long) {
  1610. PP.Diag(Loc, diag::warn_char_constant_too_large);
  1611. }
  1612. // Transfer the value from APInt to uint64_t
  1613. Value = LitVal.getZExtValue();
  1614. // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
  1615. // if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple
  1616. // character constants are not sign extended in the this implementation:
  1617. // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
  1618. if (isOrdinary() && NumCharsSoFar == 1 && (Value & 128) &&
  1619. PP.getLangOpts().CharIsSigned)
  1620. Value = (signed char)Value;
  1621. }
  1622. /// \verbatim
  1623. /// string-literal: [C++0x lex.string]
  1624. /// encoding-prefix " [s-char-sequence] "
  1625. /// encoding-prefix R raw-string
  1626. /// encoding-prefix:
  1627. /// u8
  1628. /// u
  1629. /// U
  1630. /// L
  1631. /// s-char-sequence:
  1632. /// s-char
  1633. /// s-char-sequence s-char
  1634. /// s-char:
  1635. /// any member of the source character set except the double-quote ",
  1636. /// backslash \, or new-line character
  1637. /// escape-sequence
  1638. /// universal-character-name
  1639. /// raw-string:
  1640. /// " d-char-sequence ( r-char-sequence ) d-char-sequence "
  1641. /// r-char-sequence:
  1642. /// r-char
  1643. /// r-char-sequence r-char
  1644. /// r-char:
  1645. /// any member of the source character set, except a right parenthesis )
  1646. /// followed by the initial d-char-sequence (which may be empty)
  1647. /// followed by a double quote ".
  1648. /// d-char-sequence:
  1649. /// d-char
  1650. /// d-char-sequence d-char
  1651. /// d-char:
  1652. /// any member of the basic source character set except:
  1653. /// space, the left parenthesis (, the right parenthesis ),
  1654. /// the backslash \, and the control characters representing horizontal
  1655. /// tab, vertical tab, form feed, and newline.
  1656. /// escape-sequence: [C++0x lex.ccon]
  1657. /// simple-escape-sequence
  1658. /// octal-escape-sequence
  1659. /// hexadecimal-escape-sequence
  1660. /// simple-escape-sequence:
  1661. /// one of \' \" \? \\ \a \b \f \n \r \t \v
  1662. /// octal-escape-sequence:
  1663. /// \ octal-digit
  1664. /// \ octal-digit octal-digit
  1665. /// \ octal-digit octal-digit octal-digit
  1666. /// hexadecimal-escape-sequence:
  1667. /// \x hexadecimal-digit
  1668. /// hexadecimal-escape-sequence hexadecimal-digit
  1669. /// universal-character-name:
  1670. /// \u hex-quad
  1671. /// \U hex-quad hex-quad
  1672. /// hex-quad:
  1673. /// hex-digit hex-digit hex-digit hex-digit
  1674. /// \endverbatim
  1675. ///
  1676. StringLiteralParser::
  1677. StringLiteralParser(ArrayRef<Token> StringToks,
  1678. Preprocessor &PP)
  1679. : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
  1680. Target(PP.getTargetInfo()), Diags(&PP.getDiagnostics()),
  1681. MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
  1682. ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
  1683. init(StringToks);
  1684. }
  1685. void StringLiteralParser::init(ArrayRef<Token> StringToks){
  1686. // The literal token may have come from an invalid source location (e.g. due
  1687. // to a PCH error), in which case the token length will be 0.
  1688. if (StringToks.empty() || StringToks[0].getLength() < 2)
  1689. return DiagnoseLexingError(SourceLocation());
  1690. // Scan all of the string portions, remember the max individual token length,
  1691. // computing a bound on the concatenated string length, and see whether any
  1692. // piece is a wide-string. If any of the string portions is a wide-string
  1693. // literal, the result is a wide-string literal [C99 6.4.5p4].
  1694. assert(!StringToks.empty() && "expected at least one token");
  1695. MaxTokenLength = StringToks[0].getLength();
  1696. assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
  1697. SizeBound = StringToks[0].getLength()-2; // -2 for "".
  1698. Kind = StringToks[0].getKind();
  1699. hadError = false;
  1700. // Implement Translation Phase #6: concatenation of string literals
  1701. /// (C99 5.1.1.2p1). The common case is only one string fragment.
  1702. for (unsigned i = 1; i != StringToks.size(); ++i) {
  1703. if (StringToks[i].getLength() < 2)
  1704. return DiagnoseLexingError(StringToks[i].getLocation());
  1705. // The string could be shorter than this if it needs cleaning, but this is a
  1706. // reasonable bound, which is all we need.
  1707. assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
  1708. SizeBound += StringToks[i].getLength()-2; // -2 for "".
  1709. // Remember maximum string piece length.
  1710. if (StringToks[i].getLength() > MaxTokenLength)
  1711. MaxTokenLength = StringToks[i].getLength();
  1712. // Remember if we see any wide or utf-8/16/32 strings.
  1713. // Also check for illegal concatenations.
  1714. if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
  1715. if (isOrdinary()) {
  1716. Kind = StringToks[i].getKind();
  1717. } else {
  1718. if (Diags)
  1719. Diags->Report(StringToks[i].getLocation(),
  1720. diag::err_unsupported_string_concat);
  1721. hadError = true;
  1722. }
  1723. }
  1724. }
  1725. // Include space for the null terminator.
  1726. ++SizeBound;
  1727. // TODO: K&R warning: "traditional C rejects string constant concatenation"
  1728. // Get the width in bytes of char/wchar_t/char16_t/char32_t
  1729. CharByteWidth = getCharWidth(Kind, Target);
  1730. assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
  1731. CharByteWidth /= 8;
  1732. // The output buffer size needs to be large enough to hold wide characters.
  1733. // This is a worst-case assumption which basically corresponds to L"" "long".
  1734. SizeBound *= CharByteWidth;
  1735. // Size the temporary buffer to hold the result string data.
  1736. ResultBuf.resize(SizeBound);
  1737. // Likewise, but for each string piece.
  1738. SmallString<512> TokenBuf;
  1739. TokenBuf.resize(MaxTokenLength);
  1740. // Loop over all the strings, getting their spelling, and expanding them to
  1741. // wide strings as appropriate.
  1742. ResultPtr = &ResultBuf[0]; // Next byte to fill in.
  1743. Pascal = false;
  1744. SourceLocation UDSuffixTokLoc;
  1745. for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
  1746. const char *ThisTokBuf = &TokenBuf[0];
  1747. // Get the spelling of the token, which eliminates trigraphs, etc. We know
  1748. // that ThisTokBuf points to a buffer that is big enough for the whole token
  1749. // and 'spelled' tokens can only shrink.
  1750. bool StringInvalid = false;
  1751. unsigned ThisTokLen =
  1752. Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
  1753. &StringInvalid);
  1754. if (StringInvalid)
  1755. return DiagnoseLexingError(StringToks[i].getLocation());
  1756. const char *ThisTokBegin = ThisTokBuf;
  1757. const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
  1758. // Remove an optional ud-suffix.
  1759. if (ThisTokEnd[-1] != '"') {
  1760. const char *UDSuffixEnd = ThisTokEnd;
  1761. do {
  1762. --ThisTokEnd;
  1763. } while (ThisTokEnd[-1] != '"');
  1764. StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
  1765. if (UDSuffixBuf.empty()) {
  1766. if (StringToks[i].hasUCN())
  1767. expandUCNs(UDSuffixBuf, UDSuffix);
  1768. else
  1769. UDSuffixBuf.assign(UDSuffix);
  1770. UDSuffixToken = i;
  1771. UDSuffixOffset = ThisTokEnd - ThisTokBuf;
  1772. UDSuffixTokLoc = StringToks[i].getLocation();
  1773. } else {
  1774. SmallString<32> ExpandedUDSuffix;
  1775. if (StringToks[i].hasUCN()) {
  1776. expandUCNs(ExpandedUDSuffix, UDSuffix);
  1777. UDSuffix = ExpandedUDSuffix;
  1778. }
  1779. // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
  1780. // result of a concatenation involving at least one user-defined-string-
  1781. // literal, all the participating user-defined-string-literals shall
  1782. // have the same ud-suffix.
  1783. if (UDSuffixBuf != UDSuffix) {
  1784. if (Diags) {
  1785. SourceLocation TokLoc = StringToks[i].getLocation();
  1786. Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
  1787. << UDSuffixBuf << UDSuffix
  1788. << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
  1789. << SourceRange(TokLoc, TokLoc);
  1790. }
  1791. hadError = true;
  1792. }
  1793. }
  1794. }
  1795. // Strip the end quote.
  1796. --ThisTokEnd;
  1797. // TODO: Input character set mapping support.
  1798. // Skip marker for wide or unicode strings.
  1799. if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
  1800. ++ThisTokBuf;
  1801. // Skip 8 of u8 marker for utf8 strings.
  1802. if (ThisTokBuf[0] == '8')
  1803. ++ThisTokBuf;
  1804. }
  1805. // Check for raw string
  1806. if (ThisTokBuf[0] == 'R') {
  1807. if (ThisTokBuf[1] != '"') {
  1808. // The file may have come from PCH and then changed after loading the
  1809. // PCH; Fail gracefully.
  1810. return DiagnoseLexingError(StringToks[i].getLocation());
  1811. }
  1812. ThisTokBuf += 2; // skip R"
  1813. // C++11 [lex.string]p2: A `d-char-sequence` shall consist of at most 16
  1814. // characters.
  1815. constexpr unsigned MaxRawStrDelimLen = 16;
  1816. const char *Prefix = ThisTokBuf;
  1817. while (static_cast<unsigned>(ThisTokBuf - Prefix) < MaxRawStrDelimLen &&
  1818. ThisTokBuf[0] != '(')
  1819. ++ThisTokBuf;
  1820. if (ThisTokBuf[0] != '(')
  1821. return DiagnoseLexingError(StringToks[i].getLocation());
  1822. ++ThisTokBuf; // skip '('
  1823. // Remove same number of characters from the end
  1824. ThisTokEnd -= ThisTokBuf - Prefix;
  1825. if (ThisTokEnd < ThisTokBuf)
  1826. return DiagnoseLexingError(StringToks[i].getLocation());
  1827. // C++14 [lex.string]p4: A source-file new-line in a raw string literal
  1828. // results in a new-line in the resulting execution string-literal.
  1829. StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf);
  1830. while (!RemainingTokenSpan.empty()) {
  1831. // Split the string literal on \r\n boundaries.
  1832. size_t CRLFPos = RemainingTokenSpan.find("\r\n");
  1833. StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos);
  1834. StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);
  1835. // Copy everything before the \r\n sequence into the string literal.
  1836. if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
  1837. hadError = true;
  1838. // Point into the \n inside the \r\n sequence and operate on the
  1839. // remaining portion of the literal.
  1840. RemainingTokenSpan = AfterCRLF.substr(1);
  1841. }
  1842. } else {
  1843. if (ThisTokBuf[0] != '"') {
  1844. // The file may have come from PCH and then changed after loading the
  1845. // PCH; Fail gracefully.
  1846. return DiagnoseLexingError(StringToks[i].getLocation());
  1847. }
  1848. ++ThisTokBuf; // skip "
  1849. // Check if this is a pascal string
  1850. if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
  1851. ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
  1852. // If the \p sequence is found in the first token, we have a pascal string
  1853. // Otherwise, if we already have a pascal string, ignore the first \p
  1854. if (i == 0) {
  1855. ++ThisTokBuf;
  1856. Pascal = true;
  1857. } else if (Pascal)
  1858. ThisTokBuf += 2;
  1859. }
  1860. while (ThisTokBuf != ThisTokEnd) {
  1861. // Is this a span of non-escape characters?
  1862. if (ThisTokBuf[0] != '\\') {
  1863. const char *InStart = ThisTokBuf;
  1864. do {
  1865. ++ThisTokBuf;
  1866. } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
  1867. // Copy the character span over.
  1868. if (CopyStringFragment(StringToks[i], ThisTokBegin,
  1869. StringRef(InStart, ThisTokBuf - InStart)))
  1870. hadError = true;
  1871. continue;
  1872. }
  1873. // Is this a Universal Character Name escape?
  1874. if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' ||
  1875. ThisTokBuf[1] == 'N') {
  1876. EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
  1877. ResultPtr, hadError,
  1878. FullSourceLoc(StringToks[i].getLocation(), SM),
  1879. CharByteWidth, Diags, Features);
  1880. continue;
  1881. }
  1882. // Otherwise, this is a non-UCN escape character. Process it.
  1883. unsigned ResultChar =
  1884. ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
  1885. FullSourceLoc(StringToks[i].getLocation(), SM),
  1886. CharByteWidth*8, Diags, Features);
  1887. if (CharByteWidth == 4) {
  1888. // FIXME: Make the type of the result buffer correct instead of
  1889. // using reinterpret_cast.
  1890. llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultPtr);
  1891. *ResultWidePtr = ResultChar;
  1892. ResultPtr += 4;
  1893. } else if (CharByteWidth == 2) {
  1894. // FIXME: Make the type of the result buffer correct instead of
  1895. // using reinterpret_cast.
  1896. llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultPtr);
  1897. *ResultWidePtr = ResultChar & 0xFFFF;
  1898. ResultPtr += 2;
  1899. } else {
  1900. assert(CharByteWidth == 1 && "Unexpected char width");
  1901. *ResultPtr++ = ResultChar & 0xFF;
  1902. }
  1903. }
  1904. }
  1905. }
  1906. if (Pascal) {
  1907. if (CharByteWidth == 4) {
  1908. // FIXME: Make the type of the result buffer correct instead of
  1909. // using reinterpret_cast.
  1910. llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultBuf.data());
  1911. ResultWidePtr[0] = GetNumStringChars() - 1;
  1912. } else if (CharByteWidth == 2) {
  1913. // FIXME: Make the type of the result buffer correct instead of
  1914. // using reinterpret_cast.
  1915. llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultBuf.data());
  1916. ResultWidePtr[0] = GetNumStringChars() - 1;
  1917. } else {
  1918. assert(CharByteWidth == 1 && "Unexpected char width");
  1919. ResultBuf[0] = GetNumStringChars() - 1;
  1920. }
  1921. // Verify that pascal strings aren't too large.
  1922. if (GetStringLength() > 256) {
  1923. if (Diags)
  1924. Diags->Report(StringToks.front().getLocation(),
  1925. diag::err_pascal_string_too_long)
  1926. << SourceRange(StringToks.front().getLocation(),
  1927. StringToks.back().getLocation());
  1928. hadError = true;
  1929. return;
  1930. }
  1931. } else if (Diags) {
  1932. // Complain if this string literal has too many characters.
  1933. unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
  1934. if (GetNumStringChars() > MaxChars)
  1935. Diags->Report(StringToks.front().getLocation(),
  1936. diag::ext_string_too_long)
  1937. << GetNumStringChars() << MaxChars
  1938. << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
  1939. << SourceRange(StringToks.front().getLocation(),
  1940. StringToks.back().getLocation());
  1941. }
  1942. }
  1943. static const char *resyncUTF8(const char *Err, const char *End) {
  1944. if (Err == End)
  1945. return End;
  1946. End = Err + std::min<unsigned>(llvm::getNumBytesForUTF8(*Err), End-Err);
  1947. while (++Err != End && (*Err & 0xC0) == 0x80)
  1948. ;
  1949. return Err;
  1950. }
  1951. /// This function copies from Fragment, which is a sequence of bytes
  1952. /// within Tok's contents (which begin at TokBegin) into ResultPtr.
  1953. /// Performs widening for multi-byte characters.
  1954. bool StringLiteralParser::CopyStringFragment(const Token &Tok,
  1955. const char *TokBegin,
  1956. StringRef Fragment) {
  1957. const llvm::UTF8 *ErrorPtrTmp;
  1958. if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
  1959. return false;
  1960. // If we see bad encoding for unprefixed string literals, warn and
  1961. // simply copy the byte values, for compatibility with gcc and older
  1962. // versions of clang.
  1963. bool NoErrorOnBadEncoding = isOrdinary();
  1964. if (NoErrorOnBadEncoding) {
  1965. memcpy(ResultPtr, Fragment.data(), Fragment.size());
  1966. ResultPtr += Fragment.size();
  1967. }
  1968. if (Diags) {
  1969. const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
  1970. FullSourceLoc SourceLoc(Tok.getLocation(), SM);
  1971. const DiagnosticBuilder &Builder =
  1972. Diag(Diags, Features, SourceLoc, TokBegin,
  1973. ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),
  1974. NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
  1975. : diag::err_bad_string_encoding);
  1976. const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end());
  1977. StringRef NextFragment(NextStart, Fragment.end()-NextStart);
  1978. // Decode into a dummy buffer.
  1979. SmallString<512> Dummy;
  1980. Dummy.reserve(Fragment.size() * CharByteWidth);
  1981. char *Ptr = Dummy.data();
  1982. while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) {
  1983. const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
  1984. NextStart = resyncUTF8(ErrorPtr, Fragment.end());
  1985. Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
  1986. ErrorPtr, NextStart);
  1987. NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
  1988. }
  1989. }
  1990. return !NoErrorOnBadEncoding;
  1991. }
  1992. void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
  1993. hadError = true;
  1994. if (Diags)
  1995. Diags->Report(Loc, diag::err_lexing_string);
  1996. }
  1997. /// getOffsetOfStringByte - This function returns the offset of the
  1998. /// specified byte of the string data represented by Token. This handles
  1999. /// advancing over escape sequences in the string.
  2000. unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
  2001. unsigned ByteNo) const {
  2002. // Get the spelling of the token.
  2003. SmallString<32> SpellingBuffer;
  2004. SpellingBuffer.resize(Tok.getLength());
  2005. bool StringInvalid = false;
  2006. const char *SpellingPtr = &SpellingBuffer[0];
  2007. unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
  2008. &StringInvalid);
  2009. if (StringInvalid)
  2010. return 0;
  2011. const char *SpellingStart = SpellingPtr;
  2012. const char *SpellingEnd = SpellingPtr+TokLen;
  2013. // Handle UTF-8 strings just like narrow strings.
  2014. if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
  2015. SpellingPtr += 2;
  2016. assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
  2017. SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
  2018. // For raw string literals, this is easy.
  2019. if (SpellingPtr[0] == 'R') {
  2020. assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
  2021. // Skip 'R"'.
  2022. SpellingPtr += 2;
  2023. while (*SpellingPtr != '(') {
  2024. ++SpellingPtr;
  2025. assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
  2026. }
  2027. // Skip '('.
  2028. ++SpellingPtr;
  2029. return SpellingPtr - SpellingStart + ByteNo;
  2030. }
  2031. // Skip over the leading quote
  2032. assert(SpellingPtr[0] == '"' && "Should be a string literal!");
  2033. ++SpellingPtr;
  2034. // Skip over bytes until we find the offset we're looking for.
  2035. while (ByteNo) {
  2036. assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
  2037. // Step over non-escapes simply.
  2038. if (*SpellingPtr != '\\') {
  2039. ++SpellingPtr;
  2040. --ByteNo;
  2041. continue;
  2042. }
  2043. // Otherwise, this is an escape character. Advance over it.
  2044. bool HadError = false;
  2045. if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U' ||
  2046. SpellingPtr[1] == 'N') {
  2047. const char *EscapePtr = SpellingPtr;
  2048. unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
  2049. 1, Features, HadError);
  2050. if (Len > ByteNo) {
  2051. // ByteNo is somewhere within the escape sequence.
  2052. SpellingPtr = EscapePtr;
  2053. break;
  2054. }
  2055. ByteNo -= Len;
  2056. } else {
  2057. ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
  2058. FullSourceLoc(Tok.getLocation(), SM),
  2059. CharByteWidth*8, Diags, Features);
  2060. --ByteNo;
  2061. }
  2062. assert(!HadError && "This method isn't valid on erroneous strings");
  2063. }
  2064. return SpellingPtr-SpellingStart;
  2065. }
  2066. /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
  2067. /// suffixes as ud-suffixes, because the diagnostic experience is better if we
  2068. /// treat it as an invalid suffix.
  2069. bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
  2070. StringRef Suffix) {
  2071. return NumericLiteralParser::isValidUDSuffix(LangOpts, Suffix) ||
  2072. Suffix == "sv";
  2073. }