AsmLexer.cpp 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958
  1. //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This class implements the lexer for assembly files.
  10. //
  11. //===----------------------------------------------------------------------===//
  12. #include "llvm/MC/MCParser/AsmLexer.h"
  13. #include "llvm/ADT/APInt.h"
  14. #include "llvm/ADT/ArrayRef.h"
  15. #include "llvm/ADT/StringExtras.h"
  16. #include "llvm/ADT/StringRef.h"
  17. #include "llvm/ADT/StringSwitch.h"
  18. #include "llvm/MC/MCAsmInfo.h"
  19. #include "llvm/MC/MCParser/MCAsmLexer.h"
  20. #include "llvm/Support/Compiler.h"
  21. #include "llvm/Support/SMLoc.h"
  22. #include "llvm/Support/SaveAndRestore.h"
  23. #include <cassert>
  24. #include <cctype>
  25. #include <cstdio>
  26. #include <cstring>
  27. #include <string>
  28. #include <tuple>
  29. #include <utility>
  30. using namespace llvm;
  31. AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
  32. AllowAtInIdentifier = !StringRef(MAI.getCommentString()).startswith("@");
  33. LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers();
  34. }
  35. AsmLexer::~AsmLexer() = default;
  36. void AsmLexer::setBuffer(StringRef Buf, const char *ptr,
  37. bool EndStatementAtEOF) {
  38. CurBuf = Buf;
  39. if (ptr)
  40. CurPtr = ptr;
  41. else
  42. CurPtr = CurBuf.begin();
  43. TokStart = nullptr;
  44. this->EndStatementAtEOF = EndStatementAtEOF;
  45. }
  46. /// ReturnError - Set the error to the specified string at the specified
  47. /// location. This is defined to always return AsmToken::Error.
  48. AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
  49. SetError(SMLoc::getFromPointer(Loc), Msg);
  50. return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc));
  51. }
  52. int AsmLexer::getNextChar() {
  53. if (CurPtr == CurBuf.end())
  54. return EOF;
  55. return (unsigned char)*CurPtr++;
  56. }
  57. int AsmLexer::peekNextChar() {
  58. if (CurPtr == CurBuf.end())
  59. return EOF;
  60. return (unsigned char)*CurPtr;
  61. }
  62. /// The leading integral digit sequence and dot should have already been
  63. /// consumed, some or all of the fractional digit sequence *can* have been
  64. /// consumed.
  65. AsmToken AsmLexer::LexFloatLiteral() {
  66. // Skip the fractional digit sequence.
  67. while (isDigit(*CurPtr))
  68. ++CurPtr;
  69. if (*CurPtr == '-' || *CurPtr == '+')
  70. return ReturnError(CurPtr, "invalid sign in float literal");
  71. // Check for exponent
  72. if ((*CurPtr == 'e' || *CurPtr == 'E')) {
  73. ++CurPtr;
  74. if (*CurPtr == '-' || *CurPtr == '+')
  75. ++CurPtr;
  76. while (isDigit(*CurPtr))
  77. ++CurPtr;
  78. }
  79. return AsmToken(AsmToken::Real,
  80. StringRef(TokStart, CurPtr - TokStart));
  81. }
  82. /// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
  83. /// while making sure there are enough actual digits around for the constant to
  84. /// be valid.
  85. ///
  86. /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
  87. /// before we get here.
  88. AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
  89. assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
  90. "unexpected parse state in floating hex");
  91. bool NoFracDigits = true;
  92. // Skip the fractional part if there is one
  93. if (*CurPtr == '.') {
  94. ++CurPtr;
  95. const char *FracStart = CurPtr;
  96. while (isHexDigit(*CurPtr))
  97. ++CurPtr;
  98. NoFracDigits = CurPtr == FracStart;
  99. }
  100. if (NoIntDigits && NoFracDigits)
  101. return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
  102. "expected at least one significand digit");
  103. // Make sure we do have some kind of proper exponent part
  104. if (*CurPtr != 'p' && *CurPtr != 'P')
  105. return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
  106. "expected exponent part 'p'");
  107. ++CurPtr;
  108. if (*CurPtr == '+' || *CurPtr == '-')
  109. ++CurPtr;
  110. // N.b. exponent digits are *not* hex
  111. const char *ExpStart = CurPtr;
  112. while (isDigit(*CurPtr))
  113. ++CurPtr;
  114. if (CurPtr == ExpStart)
  115. return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
  116. "expected at least one exponent digit");
  117. return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
  118. }
  119. /// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
  120. static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
  121. return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' ||
  122. (AllowAt && C == '@') || (AllowHash && C == '#');
  123. }
  124. AsmToken AsmLexer::LexIdentifier() {
  125. // Check for floating point literals.
  126. if (CurPtr[-1] == '.' && isDigit(*CurPtr)) {
  127. // Disambiguate a .1243foo identifier from a floating literal.
  128. while (isDigit(*CurPtr))
  129. ++CurPtr;
  130. if (!isIdentifierChar(*CurPtr, AllowAtInIdentifier,
  131. AllowHashInIdentifier) ||
  132. *CurPtr == 'e' || *CurPtr == 'E')
  133. return LexFloatLiteral();
  134. }
  135. while (isIdentifierChar(*CurPtr, AllowAtInIdentifier, AllowHashInIdentifier))
  136. ++CurPtr;
  137. // Handle . as a special case.
  138. if (CurPtr == TokStart+1 && TokStart[0] == '.')
  139. return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
  140. return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
  141. }
  142. /// LexSlash: Slash: /
  143. /// C-Style Comment: /* ... */
  144. /// C-style Comment: // ...
  145. AsmToken AsmLexer::LexSlash() {
  146. if (!MAI.shouldAllowAdditionalComments()) {
  147. IsAtStartOfStatement = false;
  148. return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
  149. }
  150. switch (*CurPtr) {
  151. case '*':
  152. IsAtStartOfStatement = false;
  153. break; // C style comment.
  154. case '/':
  155. ++CurPtr;
  156. return LexLineComment();
  157. default:
  158. IsAtStartOfStatement = false;
  159. return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
  160. }
  161. // C Style comment.
  162. ++CurPtr; // skip the star.
  163. const char *CommentTextStart = CurPtr;
  164. while (CurPtr != CurBuf.end()) {
  165. switch (*CurPtr++) {
  166. case '*':
  167. // End of the comment?
  168. if (*CurPtr != '/')
  169. break;
  170. // If we have a CommentConsumer, notify it about the comment.
  171. if (CommentConsumer) {
  172. CommentConsumer->HandleComment(
  173. SMLoc::getFromPointer(CommentTextStart),
  174. StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart));
  175. }
  176. ++CurPtr; // End the */.
  177. return AsmToken(AsmToken::Comment,
  178. StringRef(TokStart, CurPtr - TokStart));
  179. }
  180. }
  181. return ReturnError(TokStart, "unterminated comment");
  182. }
  183. /// LexLineComment: Comment: #[^\n]*
  184. /// : //[^\n]*
  185. AsmToken AsmLexer::LexLineComment() {
  186. // Mark This as an end of statement with a body of the
  187. // comment. While it would be nicer to leave this two tokens,
  188. // backwards compatability with TargetParsers makes keeping this in this form
  189. // better.
  190. const char *CommentTextStart = CurPtr;
  191. int CurChar = getNextChar();
  192. while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
  193. CurChar = getNextChar();
  194. const char *NewlinePtr = CurPtr;
  195. if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n')
  196. ++CurPtr;
  197. // If we have a CommentConsumer, notify it about the comment.
  198. if (CommentConsumer) {
  199. CommentConsumer->HandleComment(
  200. SMLoc::getFromPointer(CommentTextStart),
  201. StringRef(CommentTextStart, NewlinePtr - 1 - CommentTextStart));
  202. }
  203. IsAtStartOfLine = true;
  204. // This is a whole line comment. leave newline
  205. if (IsAtStartOfStatement)
  206. return AsmToken(AsmToken::EndOfStatement,
  207. StringRef(TokStart, CurPtr - TokStart));
  208. IsAtStartOfStatement = true;
  209. return AsmToken(AsmToken::EndOfStatement,
  210. StringRef(TokStart, CurPtr - 1 - TokStart));
  211. }
  /// Advance \p CurPtr past an ignored integer type suffix: an optional 'U'
  /// followed by at most two 'L's (i.e. U, L, UL, LL or ULL).
  static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
    if (*CurPtr == 'U')
      ++CurPtr;
    for (int Seen = 0; Seen < 2 && *CurPtr == 'L'; ++Seen)
      ++CurPtr;
  }
  221. // Look ahead to search for first non-hex digit, if it's [hH], then we treat the
  222. // integer as a hexadecimal, possibly with leading zeroes.
  223. static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix,
  224. bool LexHex) {
  225. const char *FirstNonDec = nullptr;
  226. const char *LookAhead = CurPtr;
  227. while (true) {
  228. if (isDigit(*LookAhead)) {
  229. ++LookAhead;
  230. } else {
  231. if (!FirstNonDec)
  232. FirstNonDec = LookAhead;
  233. // Keep going if we are looking for a 'h' suffix.
  234. if (LexHex && isHexDigit(*LookAhead))
  235. ++LookAhead;
  236. else
  237. break;
  238. }
  239. }
  240. bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H');
  241. CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec;
  242. if (isHex)
  243. return 16;
  244. return DefaultRadix;
  245. }
  246. static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) {
  247. while (hexDigitValue(*CurPtr) < DefaultRadix) {
  248. ++CurPtr;
  249. }
  250. return CurPtr;
  251. }
  252. static AsmToken intToken(StringRef Ref, APInt &Value) {
  253. if (Value.isIntN(64))
  254. return AsmToken(AsmToken::Integer, Ref, Value);
  255. return AsmToken(AsmToken::BigNum, Ref, Value);
  256. }
  /// Human-readable name of \p Radix for diagnostics: the usual names for
  /// 2, 8, 10 and 16, and "base-N" for anything else.
  static std::string radixName(unsigned Radix) {
    if (Radix == 2)
      return "binary";
    if (Radix == 8)
      return "octal";
    if (Radix == 10)
      return "decimal";
    if (Radix == 16)
      return "hexadecimal";
    return "base-" + std::to_string(Radix);
  }
  /// Lexes a numeric literal whose first character has already been consumed
  /// (it is available as CurPtr[-1]). Besides a decimal digit, that character
  /// may be '$' or '%' when Motorola-style integers are enabled.
  /// Returns an Integer or BigNum token (see intToken), a Real token for
  /// floating-point forms, or an Error token for malformed input.
  ///
  271. /// LexDigit: First character is [0-9].
  272. /// Local Label: [0-9][:]
  273. /// Forward/Backward Label: [0-9][fb]
  274. /// Binary integer: 0b[01]+
  275. /// Octal integer: 0[0-7]+
  276. /// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
  277. /// Decimal integer: [1-9][0-9]*
  278. AsmToken AsmLexer::LexDigit() {
  279. // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY])
  280. // MASM-flavor octal integer: [0-7]+[oOqQ]
  281. // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT])
  282. // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH]
  283. if (LexMasmIntegers && isdigit(CurPtr[-1])) {
  // Track where the literal first stops being valid binary / decimal so the
  // trailing 'b' / 'd' radix suffixes can be told apart from hex digits below.
  284. const char *FirstNonBinary =
  285. (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr;
  286. const char *FirstNonDecimal =
  287. (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr;
  // Remembered so the scan can be undone if no radix suffix is found.
  288. const char *OldCurPtr = CurPtr;
  289. while (isHexDigit(*CurPtr)) {
  290. switch (*CurPtr) {
  // default: a hex letter, i.e. neither decimal nor binary.
  291. default:
  292. if (!FirstNonDecimal) {
  293. FirstNonDecimal = CurPtr;
  294. }
  295. LLVM_FALLTHROUGH;
  296. case '9':
  297. case '8':
  298. case '7':
  299. case '6':
  300. case '5':
  301. case '4':
  302. case '3':
  303. case '2':
  304. if (!FirstNonBinary) {
  305. FirstNonBinary = CurPtr;
  306. }
  307. break;
  308. case '1':
  309. case '0':
  310. break;
  311. }
  312. ++CurPtr;
  313. }
  314. if (*CurPtr == '.') {
  315. // MASM float literals (other than hex floats) always contain a ".", and
  316. // are always written in decimal.
  317. ++CurPtr;
  318. return LexFloatLiteral();
  319. }
  // An 'r' / 'R' suffix yields a Real token when MASM hex floats are enabled.
  320. if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) {
  321. ++CurPtr;
  322. return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
  323. }
  // Radix stays 0 when no explicit radix suffix is recognised.
  324. unsigned Radix = 0;
  325. if (*CurPtr == 'h' || *CurPtr == 'H') {
  326. // hexadecimal number
  327. ++CurPtr;
  328. Radix = 16;
  329. } else if (*CurPtr == 't' || *CurPtr == 'T') {
  330. // decimal number
  331. ++CurPtr;
  332. Radix = 10;
  333. } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' ||
  334. *CurPtr == 'Q') {
  335. // octal number
  336. ++CurPtr;
  337. Radix = 8;
  338. } else if (*CurPtr == 'y' || *CurPtr == 'Y') {
  339. // binary number
  340. ++CurPtr;
  341. Radix = 2;
  // 'd' / 'b' were consumed by the hex-digit scan above. They act as radix
  // suffixes only when they are the final character scanned and the default
  // radix is small enough that they cannot be digits.
  342. } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr &&
  343. DefaultRadix < 14 &&
  344. (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) {
  345. Radix = 10;
  346. } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr &&
  347. DefaultRadix < 12 &&
  348. (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) {
  349. Radix = 2;
  350. }
  351. if (Radix) {
  352. StringRef Result(TokStart, CurPtr - TokStart);
  353. APInt Value(128, 0, true);
  // drop_back() removes the radix suffix character before parsing.
  354. if (Result.drop_back().getAsInteger(Radix, Value))
  355. return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
  356. // MSVC accepts and ignores type suffixes on integer literals.
  357. SkipIgnoredIntegerSuffix(CurPtr);
  358. return intToken(Result, Value);
  359. }
  360. // default-radix integers, or floating point numbers, fall through
  361. CurPtr = OldCurPtr;
  362. }
  363. // MASM default-radix integers: [0-9a-fA-F]+
  364. // (All other integer literals have a radix specifier.)
  365. if (LexMasmIntegers && UseMasmDefaultRadix) {
  366. CurPtr = findLastDigit(CurPtr, 16);
  367. StringRef Result(TokStart, CurPtr - TokStart);
  368. APInt Value(128, 0, true);
  369. if (Result.getAsInteger(DefaultRadix, Value)) {
  370. return ReturnError(TokStart,
  371. "invalid " + radixName(DefaultRadix) + " number");
  372. }
  373. return intToken(Result, Value);
  374. }
  375. // Motorola hex integers: $[0-9a-fA-F]+
  376. if (LexMotorolaIntegers && CurPtr[-1] == '$') {
  // Only the digits after '$' are parsed; the token text includes the '$'.
  377. const char *NumStart = CurPtr;
  378. while (isHexDigit(CurPtr[0]))
  379. ++CurPtr;
  380. APInt Result(128, 0);
  381. if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(16, Result))
  382. return ReturnError(TokStart, "invalid hexadecimal number");
  383. return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
  384. }
  385. // Motorola binary integers: %[01]+
  386. if (LexMotorolaIntegers && CurPtr[-1] == '%') {
  // Only the digits after '%' are parsed; the token text includes the '%'.
  387. const char *NumStart = CurPtr;
  388. while (*CurPtr == '0' || *CurPtr == '1')
  389. ++CurPtr;
  390. APInt Result(128, 0);
  391. if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(2, Result))
  392. return ReturnError(TokStart, "invalid binary number");
  393. return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
  394. }
  395. // Decimal integer: [1-9][0-9]*
  396. // HLASM-flavour decimal integer: [0-9][0-9]*
  397. // FIXME: Later on, support for fb for HLASM has to be added in
  398. // as they probably would be needed for asm goto
  // Taken when HLASM integers are enabled, when the first digit is not '0',
  // or when the first digit is immediately followed by '.'.
  399. if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') {
  400. unsigned Radix = doHexLookAhead(CurPtr, 10, LexMasmIntegers);
  401. if (!LexHLASMIntegers) {
  402. bool IsHex = Radix == 16;
  403. // Check for floating point literals.
  404. if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) {
  405. if (*CurPtr == '.')
  406. ++CurPtr;
  407. return LexFloatLiteral();
  408. }
  409. }
  410. StringRef Result(TokStart, CurPtr - TokStart);
  411. APInt Value(128, 0, true);
  412. if (Result.getAsInteger(Radix, Value))
  413. return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
  414. if (!LexHLASMIntegers)
  415. // The darwin/x86 (and x86-64) assembler accepts and ignores type
  416. // suffixes on integer literals.
  417. SkipIgnoredIntegerSuffix(CurPtr);
  418. return intToken(Result, Value);
  419. }
  // From here on the literal starts with '0'. First: the "0b" binary prefix
  // (not used in MASM mode).
  420. if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
  421. ++CurPtr;
  422. // See if we actually have "0b" as part of something like "jmp 0b\n"
  423. if (!isDigit(CurPtr[0])) {
  // Un-consume the 'b' and return just the "0" as an integer.
  424. --CurPtr;
  425. StringRef Result(TokStart, CurPtr - TokStart);
  426. return AsmToken(AsmToken::Integer, Result, 0);
  427. }
  428. const char *NumStart = CurPtr;
  429. while (CurPtr[0] == '0' || CurPtr[0] == '1')
  430. ++CurPtr;
  431. // Requires at least one binary digit.
  432. if (CurPtr == NumStart)
  433. return ReturnError(TokStart, "invalid binary number");
  434. StringRef Result(TokStart, CurPtr - TokStart);
  435. APInt Value(128, 0, true);
  // substr(2) skips the "0b" prefix.
  436. if (Result.substr(2).getAsInteger(2, Value))
  437. return ReturnError(TokStart, "invalid binary number");
  438. // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
  439. // suffixes on integer literals.
  440. SkipIgnoredIntegerSuffix(CurPtr);
  441. return intToken(Result, Value);
  442. }
  // "0x" prefix: hexadecimal integer or hexadecimal floating-point literal.
  443. if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
  444. ++CurPtr;
  445. const char *NumStart = CurPtr;
  446. while (isHexDigit(CurPtr[0]))
  447. ++CurPtr;
  448. // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
  449. // diagnosed by LexHexFloatLiteral).
  450. if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
  451. return LexHexFloatLiteral(NumStart == CurPtr);
  452. // Otherwise requires at least one hex digit.
  453. if (CurPtr == NumStart)
  454. return ReturnError(CurPtr-2, "invalid hexadecimal number");
  455. APInt Result(128, 0);
  // The parsed text still contains the "0x" prefix and radix 0 is passed.
  // NOTE(review): presumably radix 0 makes getAsInteger auto-detect the base
  // from that prefix -- confirm against StringRef::getAsInteger.
  456. if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
  457. return ReturnError(TokStart, "invalid hexadecimal number");
  458. // Consume the optional [hH].
  459. if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H'))
  460. ++CurPtr;
  461. // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
  462. // suffixes on integer literals.
  463. SkipIgnoredIntegerSuffix(CurPtr);
  464. return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
  465. }
  466. // Either octal or hexadecimal.
  467. APInt Value(128, 0, true);
  468. unsigned Radix = doHexLookAhead(CurPtr, 8, LexMasmIntegers);
  // Result is formed before the [hH] is consumed, so it excludes the suffix.
  469. StringRef Result(TokStart, CurPtr - TokStart);
  470. if (Result.getAsInteger(Radix, Value))
  471. return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
  472. // Consume the [hH].
  473. if (Radix == 16)
  474. ++CurPtr;
  475. // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
  476. // suffixes on integer literals.
  477. SkipIgnoredIntegerSuffix(CurPtr);
  478. return intToken(Result, Value);
  479. }
  480. /// LexSingleQuote: Integer: 'b'
  481. AsmToken AsmLexer::LexSingleQuote() {
  482. int CurChar = getNextChar();
  483. if (LexHLASMStrings)
  484. return ReturnError(TokStart, "invalid usage of character literals");
  485. if (LexMasmStrings) {
  486. while (CurChar != EOF) {
  487. if (CurChar != '\'') {
  488. CurChar = getNextChar();
  489. } else if (peekNextChar() == '\'') {
  490. // In MASM single-quote strings, doubled single-quotes mean an escaped
  491. // single quote, so should be lexed in.
  492. getNextChar();
  493. CurChar = getNextChar();
  494. } else {
  495. break;
  496. }
  497. }
  498. if (CurChar == EOF)
  499. return ReturnError(TokStart, "unterminated string constant");
  500. return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
  501. }
  502. if (CurChar == '\\')
  503. CurChar = getNextChar();
  504. if (CurChar == EOF)
  505. return ReturnError(TokStart, "unterminated single quote");
  506. CurChar = getNextChar();
  507. if (CurChar != '\'')
  508. return ReturnError(TokStart, "single quote way too long");
  509. // The idea here being that 'c' is basically just an integral
  510. // constant.
  511. StringRef Res = StringRef(TokStart,CurPtr - TokStart);
  512. long long Value;
  513. if (Res.startswith("\'\\")) {
  514. char theChar = Res[2];
  515. switch (theChar) {
  516. default: Value = theChar; break;
  517. case '\'': Value = '\''; break;
  518. case 't': Value = '\t'; break;
  519. case 'n': Value = '\n'; break;
  520. case 'b': Value = '\b'; break;
  521. case 'f': Value = '\f'; break;
  522. case 'r': Value = '\r'; break;
  523. }
  524. } else
  525. Value = TokStart[1];
  526. return AsmToken(AsmToken::Integer, Res, Value);
  527. }
  528. /// LexQuote: String: "..."
  529. AsmToken AsmLexer::LexQuote() {
  530. int CurChar = getNextChar();
  531. if (LexHLASMStrings)
  532. return ReturnError(TokStart, "invalid usage of string literals");
  533. if (LexMasmStrings) {
  534. while (CurChar != EOF) {
  535. if (CurChar != '"') {
  536. CurChar = getNextChar();
  537. } else if (peekNextChar() == '"') {
  538. // In MASM double-quoted strings, doubled double-quotes mean an escaped
  539. // double quote, so should be lexed in.
  540. getNextChar();
  541. CurChar = getNextChar();
  542. } else {
  543. break;
  544. }
  545. }
  546. if (CurChar == EOF)
  547. return ReturnError(TokStart, "unterminated string constant");
  548. return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
  549. }
  550. // TODO: does gas allow multiline string constants?
  551. while (CurChar != '"') {
  552. if (CurChar == '\\') {
  553. // Allow \", etc.
  554. CurChar = getNextChar();
  555. }
  556. if (CurChar == EOF)
  557. return ReturnError(TokStart, "unterminated string constant");
  558. CurChar = getNextChar();
  559. }
  560. return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
  561. }
  562. StringRef AsmLexer::LexUntilEndOfStatement() {
  563. TokStart = CurPtr;
  564. while (!isAtStartOfComment(CurPtr) && // Start of line comment.
  565. !isAtStatementSeparator(CurPtr) && // End of statement marker.
  566. *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
  567. ++CurPtr;
  568. }
  569. return StringRef(TokStart, CurPtr-TokStart);
  570. }
  571. StringRef AsmLexer::LexUntilEndOfLine() {
  572. TokStart = CurPtr;
  573. while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
  574. ++CurPtr;
  575. }
  576. return StringRef(TokStart, CurPtr-TokStart);
  577. }
  578. size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf,
  579. bool ShouldSkipSpace) {
  580. SaveAndRestore<const char *> SavedTokenStart(TokStart);
  581. SaveAndRestore<const char *> SavedCurPtr(CurPtr);
  582. SaveAndRestore<bool> SavedAtStartOfLine(IsAtStartOfLine);
  583. SaveAndRestore<bool> SavedAtStartOfStatement(IsAtStartOfStatement);
  584. SaveAndRestore<bool> SavedSkipSpace(SkipSpace, ShouldSkipSpace);
  585. SaveAndRestore<bool> SavedIsPeeking(IsPeeking, true);
  586. std::string SavedErr = getErr();
  587. SMLoc SavedErrLoc = getErrLoc();
  588. size_t ReadCount;
  589. for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) {
  590. AsmToken Token = LexToken();
  591. Buf[ReadCount] = Token;
  592. if (Token.is(AsmToken::Eof))
  593. break;
  594. }
  595. SetError(SavedErrLoc, SavedErr);
  596. return ReadCount;
  597. }
  598. bool AsmLexer::isAtStartOfComment(const char *Ptr) {
  599. if (MAI.getRestrictCommentStringToStartOfStatement() && !IsAtStartOfStatement)
  600. return false;
  601. StringRef CommentString = MAI.getCommentString();
  602. if (CommentString.size() == 1)
  603. return CommentString[0] == Ptr[0];
  604. // Allow # preprocessor commments also be counted as comments for "##" cases
  605. if (CommentString[1] == '#')
  606. return CommentString[0] == Ptr[0];
  607. return strncmp(Ptr, CommentString.data(), CommentString.size()) == 0;
  608. }
  609. bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
  610. return strncmp(Ptr, MAI.getSeparatorString(),
  611. strlen(MAI.getSeparatorString())) == 0;
  612. }
  /// Lexes and returns the next token starting at CurPtr. Updates TokStart,
  /// CurPtr and the IsAtStartOfLine / IsAtStartOfStatement flags as a side
  /// effect. Malformed input is reported through ReturnError() as an Error
  /// token; the end of the buffer yields Eof (optionally preceded by a
  /// synthesized EndOfStatement, see below).
  613. AsmToken AsmLexer::LexToken() {
  614. TokStart = CurPtr;
  615. // This always consumes at least one character.
  616. int CurChar = getNextChar();
  // Not done while peeking: the look-ahead below calls peekTokens(), which
  // itself runs LexToken() with IsPeeking set.
  617. if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) {
  618. // If this starts with a '#', this may be a cpp
  619. // hash directive and otherwise a line comment.
  620. AsmToken TokenBuf[2];
  621. MutableArrayRef<AsmToken> Buf(TokenBuf, 2);
  // peekTokens() does not count an Eof token, so num == 2 means two real
  // tokens follow the '#'.
  622. size_t num = peekTokens(Buf, true);
  623. // There cannot be a space preceding this
  624. if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(AsmToken::Integer) &&
  625. TokenBuf[1].is(AsmToken::String)) {
  626. CurPtr = TokStart; // reset curPtr;
  627. StringRef s = LexUntilEndOfLine();
  // Queue the peeked Integer and String tokens via UnLex().
  628. UnLex(TokenBuf[1]);
  629. UnLex(TokenBuf[0]);
  630. return AsmToken(AsmToken::HashDirective, s);
  631. }
  632. if (MAI.shouldAllowAdditionalComments())
  633. return LexLineComment();
  634. }
  635. if (isAtStartOfComment(TokStart))
  636. return LexLineComment();
  637. if (isAtStatementSeparator(TokStart)) {
  // The first separator character was already consumed by getNextChar().
  638. CurPtr += strlen(MAI.getSeparatorString()) - 1;
  639. IsAtStartOfLine = true;
  640. IsAtStartOfStatement = true;
  641. return AsmToken(AsmToken::EndOfStatement,
  642. StringRef(TokStart, strlen(MAI.getSeparatorString())));
  643. }
  644. // If we're missing a newline at EOF, make sure we still get an
  645. // EndOfStatement token before the Eof token.
  646. if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) {
  647. IsAtStartOfLine = true;
  648. IsAtStartOfStatement = true;
  649. return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0));
  650. }
  // By default a token ends the start-of-line / start-of-statement state.
  // The whitespace and '/' cases below restore the saved statement flag.
  651. IsAtStartOfLine = false;
  652. bool OldIsAtStartOfStatement = IsAtStartOfStatement;
  653. IsAtStartOfStatement = false;
  654. switch (CurChar) {
  655. default:
  656. // Handle identifier: [a-zA-Z_.?][a-zA-Z0-9_$.@#?]*
  657. if (isalpha(CurChar) || CurChar == '_' || CurChar == '.' ||
  658. (MAI.doesAllowQuestionAtStartOfIdentifier() && CurChar == '?'))
  659. return LexIdentifier();
  660. // Unknown character, emit an error.
  661. return ReturnError(TokStart, "invalid character in input");
  662. case EOF:
  663. if (EndStatementAtEOF) {
  664. IsAtStartOfLine = true;
  665. IsAtStartOfStatement = true;
  666. }
  667. return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
  // A NUL byte is lexed the same way as a space or tab.
  668. case 0:
  669. case ' ':
  670. case '\t':
  671. IsAtStartOfStatement = OldIsAtStartOfStatement;
  672. while (*CurPtr == ' ' || *CurPtr == '\t')
  673. CurPtr++;
  674. if (SkipSpace)
  675. return LexToken(); // Ignore whitespace.
  676. else
  677. return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart));
  678. case '\r': {
  679. IsAtStartOfLine = true;
  680. IsAtStartOfStatement = true;
  681. // If this is a CR followed by LF, treat that as one token.
  682. if (CurPtr != CurBuf.end() && *CurPtr == '\n')
  683. ++CurPtr;
  684. return AsmToken(AsmToken::EndOfStatement,
  685. StringRef(TokStart, CurPtr - TokStart));
  686. }
  687. case '\n':
  688. IsAtStartOfLine = true;
  689. IsAtStartOfStatement = true;
  690. return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
  691. case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
  692. case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
  693. case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
  694. case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
  695. case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
  696. case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
  697. case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
  698. case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
  699. case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
  700. case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
  701. case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
  // '$' is, in order of preference: a Motorola hex literal prefix, the start
  // of an identifier, or a stand-alone Dollar token.
  702. case '$': {
  703. if (LexMotorolaIntegers && isHexDigit(*CurPtr))
  704. return LexDigit();
  705. if (MAI.doesAllowDollarAtStartOfIdentifier())
  706. return LexIdentifier();
  707. return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
  708. }
  709. case '@': {
  710. if (MAI.doesAllowAtAtStartOfIdentifier())
  711. return LexIdentifier();
  712. return AsmToken(AsmToken::At, StringRef(TokStart, 1));
  713. }
  714. case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
  // Operators that may be one or two characters long: peek at *CurPtr and
  // consume it only when it completes the two-character form.
  715. case '=':
  716. if (*CurPtr == '=') {
  717. ++CurPtr;
  718. return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
  719. }
  720. return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
  721. case '-':
  722. if (*CurPtr == '>') {
  723. ++CurPtr;
  724. return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2));
  725. }
  726. return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
  727. case '|':
  728. if (*CurPtr == '|') {
  729. ++CurPtr;
  730. return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
  731. }
  732. return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
  733. case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
  734. case '&':
  735. if (*CurPtr == '&') {
  736. ++CurPtr;
  737. return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
  738. }
  739. return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
  740. case '!':
  741. if (*CurPtr == '=') {
  742. ++CurPtr;
  743. return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
  744. }
  745. return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
  // '%' is, in order of preference: a Motorola binary literal prefix, a MIPS
  // %-operator such as %hi / %lo, or a stand-alone Percent token.
  746. case '%':
  747. if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1')) {
  748. return LexDigit();
  749. }
  750. if (MAI.hasMipsExpressions()) {
  751. AsmToken::TokenKind Operator;
  752. unsigned OperatorLength;
  // Each length below counts the leading '%' as well as the operator name.
  // Longer names are listed before their prefixes (e.g. "got_disp" before
  // "got", "higher" before "hi").
  753. std::tie(Operator, OperatorLength) =
  754. StringSwitch<std::pair<AsmToken::TokenKind, unsigned>>(
  755. StringRef(CurPtr))
  756. .StartsWith("call16", {AsmToken::PercentCall16, 7})
  757. .StartsWith("call_hi", {AsmToken::PercentCall_Hi, 8})
  758. .StartsWith("call_lo", {AsmToken::PercentCall_Lo, 8})
  759. .StartsWith("dtprel_hi", {AsmToken::PercentDtprel_Hi, 10})
  760. .StartsWith("dtprel_lo", {AsmToken::PercentDtprel_Lo, 10})
  761. .StartsWith("got_disp", {AsmToken::PercentGot_Disp, 9})
  762. .StartsWith("got_hi", {AsmToken::PercentGot_Hi, 7})
  763. .StartsWith("got_lo", {AsmToken::PercentGot_Lo, 7})
  764. .StartsWith("got_ofst", {AsmToken::PercentGot_Ofst, 9})
  765. .StartsWith("got_page", {AsmToken::PercentGot_Page, 9})
  766. .StartsWith("gottprel", {AsmToken::PercentGottprel, 9})
  767. .StartsWith("got", {AsmToken::PercentGot, 4})
  768. .StartsWith("gp_rel", {AsmToken::PercentGp_Rel, 7})
  769. .StartsWith("higher", {AsmToken::PercentHigher, 7})
  770. .StartsWith("highest", {AsmToken::PercentHighest, 8})
  771. .StartsWith("hi", {AsmToken::PercentHi, 3})
  772. .StartsWith("lo", {AsmToken::PercentLo, 3})
  773. .StartsWith("neg", {AsmToken::PercentNeg, 4})
  774. .StartsWith("pcrel_hi", {AsmToken::PercentPcrel_Hi, 9})
  775. .StartsWith("pcrel_lo", {AsmToken::PercentPcrel_Lo, 9})
  776. .StartsWith("tlsgd", {AsmToken::PercentTlsgd, 6})
  777. .StartsWith("tlsldm", {AsmToken::PercentTlsldm, 7})
  778. .StartsWith("tprel_hi", {AsmToken::PercentTprel_Hi, 9})
  779. .StartsWith("tprel_lo", {AsmToken::PercentTprel_Lo, 9})
  780. .Default({AsmToken::Percent, 1});
  781. if (Operator != AsmToken::Percent) {
  // The '%' itself was already consumed by getNextChar(), hence the -1.
  782. CurPtr += OperatorLength - 1;
  783. return AsmToken(Operator, StringRef(TokStart, OperatorLength));
  784. }
  785. }
  786. return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
  787. case '/':
  // LexSlash() decides for itself whether the statement-start state survives.
  788. IsAtStartOfStatement = OldIsAtStartOfStatement;
  789. return LexSlash();
  790. case '#': {
  791. if (MAI.doesAllowHashAtStartOfIdentifier())
  792. return LexIdentifier();
  793. return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
  794. }
  795. case '\'': return LexSingleQuote();
  796. case '"': return LexQuote();
  797. case '0': case '1': case '2': case '3': case '4':
  798. case '5': case '6': case '7': case '8': case '9':
  799. return LexDigit();
  800. case '<':
  801. switch (*CurPtr) {
  802. case '<':
  803. ++CurPtr;
  804. return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2));
  805. case '=':
  806. ++CurPtr;
  807. return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2));
  808. case '>':
  809. ++CurPtr;
  810. return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2));
  811. default:
  812. return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
  813. }
  814. case '>':
  815. switch (*CurPtr) {
  816. case '>':
  817. ++CurPtr;
  818. return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2));
  819. case '=':
  820. ++CurPtr;
  821. return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2));
  822. default:
  823. return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
  824. }
  825. // TODO: Quoted identifiers (objc methods etc)
  826. // local labels: [0-9][:]
  827. // Forward/backward labels: [0-9][fb]
  828. // Integers, fp constants, character constants.
  829. }
  830. }