// Lexer.h
  1. #pragma once
  2. #ifdef __GNUC__
  3. #pragma GCC diagnostic push
  4. #pragma GCC diagnostic ignored "-Wunused-parameter"
  5. #endif
  6. //===- Lexer.h - C Language Family Lexer ------------------------*- C++ -*-===//
  7. //
  8. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  9. // See https://llvm.org/LICENSE.txt for license information.
  10. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  11. //
  12. //===----------------------------------------------------------------------===//
  13. //
  14. // This file defines the Lexer interface.
  15. //
  16. //===----------------------------------------------------------------------===//
  17. #ifndef LLVM_CLANG_LEX_LEXER_H
  18. #define LLVM_CLANG_LEX_LEXER_H
  19. #include "clang/Basic/LangOptions.h"
  20. #include "clang/Basic/SourceLocation.h"
  21. #include "clang/Basic/TokenKinds.h"
  22. #include "clang/Lex/DependencyDirectivesScanner.h"
  23. #include "clang/Lex/PreprocessorLexer.h"
  24. #include "clang/Lex/Token.h"
  25. #include "llvm/ADT/SmallVector.h"
  26. #include "llvm/ADT/StringRef.h"
  27. #include <cassert>
  28. #include <cstdint>
  29. #include <optional>
  30. #include <string>
  31. namespace llvm {
  32. class MemoryBufferRef;
  33. } // namespace llvm
  34. namespace clang {
  35. class DiagnosticBuilder;
  36. class Preprocessor;
  37. class SourceManager;
  38. class LangOptions;
/// ConflictMarkerKind - Kinds of conflict marker which the lexer might be
/// recovering from. The lexer detects VCS conflict markers left in source
/// files so it can emit a diagnostic and skip past them.
enum ConflictMarkerKind {
  /// Not within a conflict marker.
  CMK_None,

  /// A normal or diff3 conflict marker, initiated by at least 7 "<"s,
  /// separated by at least 7 "="s or "|"s, and terminated by at least 7 ">"s.
  CMK_Normal,

  /// A Perforce-style conflict marker, initiated by 4 ">"s,
  /// separated by 4 "="s, and terminated by 4 "<"s.
  CMK_Perforce
};
  51. /// Describes the bounds (start, size) of the preamble and a flag required by
  52. /// PreprocessorOptions::PrecompiledPreambleBytes.
  53. /// The preamble includes the BOM, if any.
  54. struct PreambleBounds {
  55. /// Size of the preamble in bytes.
  56. unsigned Size;
  57. /// Whether the preamble ends at the start of a new line.
  58. ///
  59. /// Used to inform the lexer as to whether it's starting at the beginning of
  60. /// a line after skipping the preamble.
  61. bool PreambleEndsAtStartOfLine;
  62. PreambleBounds(unsigned Size, bool PreambleEndsAtStartOfLine)
  63. : Size(Size), PreambleEndsAtStartOfLine(PreambleEndsAtStartOfLine) {}
  64. };
  65. /// Lexer - This provides a simple interface that turns a text buffer into a
  66. /// stream of tokens. This provides no support for file reading or buffering,
  67. /// or buffering/seeking of tokens, only forward lexing is supported. It relies
  68. /// on the specified Preprocessor object to handle preprocessor directives, etc.
  69. class Lexer : public PreprocessorLexer {
  70. friend class Preprocessor;
  71. void anchor() override;
  72. //===--------------------------------------------------------------------===//
  73. // Constant configuration values for this lexer.
  74. // Start of the buffer.
  75. const char *BufferStart;
  76. // End of the buffer.
  77. const char *BufferEnd;
  78. // Location for start of file.
  79. SourceLocation FileLoc;
  80. // LangOpts enabled by this language.
  81. // Storing LangOptions as reference here is important from performance point
  82. // of view. Lack of reference means that LangOptions copy constructor would be
  83. // called by Lexer(..., const LangOptions &LangOpts,...). Given that local
  84. // Lexer objects are created thousands times (in Lexer::getRawToken,
  85. // Preprocessor::EnterSourceFile and other places) during single module
  86. // processing in frontend it would make std::vector<std::string> copy
  87. // constructors surprisingly hot.
  88. const LangOptions &LangOpts;
  89. // True if '//' line comments are enabled.
  90. bool LineComment;
  91. // True if lexer for _Pragma handling.
  92. bool Is_PragmaLexer;
  93. //===--------------------------------------------------------------------===//
  94. // Context-specific lexing flags set by the preprocessor.
  95. //
  96. /// ExtendedTokenMode - The lexer can optionally keep comments and whitespace
  97. /// and return them as tokens. This is used for -C and -CC modes, and
  98. /// whitespace preservation can be useful for some clients that want to lex
  99. /// the file in raw mode and get every character from the file.
  100. ///
  101. /// When this is set to 2 it returns comments and whitespace. When set to 1
  102. /// it returns comments, when it is set to 0 it returns normal tokens only.
  103. unsigned char ExtendedTokenMode;
  104. //===--------------------------------------------------------------------===//
  105. // Context that changes as the file is lexed.
  106. // NOTE: any state that mutates when in raw mode must have save/restore code
  107. // in Lexer::isNextPPTokenLParen.
  108. // BufferPtr - Current pointer into the buffer. This is the next character
  109. // to be lexed.
  110. const char *BufferPtr;
  111. // IsAtStartOfLine - True if the next lexed token should get the "start of
  112. // line" flag set on it.
  113. bool IsAtStartOfLine;
  114. bool IsAtPhysicalStartOfLine;
  115. bool HasLeadingSpace;
  116. bool HasLeadingEmptyMacro;
  117. /// True if this is the first time we're lexing the input file.
  118. bool IsFirstTimeLexingFile;
  119. // NewLinePtr - A pointer to new line character '\n' being lexed. For '\r\n',
  120. // it also points to '\n.'
  121. const char *NewLinePtr;
  122. // CurrentConflictMarkerState - The kind of conflict marker we are handling.
  123. ConflictMarkerKind CurrentConflictMarkerState;
  124. /// Non-empty if this \p Lexer is \p isDependencyDirectivesLexer().
  125. ArrayRef<dependency_directives_scan::Directive> DepDirectives;
  126. /// If this \p Lexer is \p isDependencyDirectivesLexer(), it represents the
  127. /// next token to use from the current dependency directive.
  128. unsigned NextDepDirectiveTokenIndex = 0;
  129. void InitLexer(const char *BufStart, const char *BufPtr, const char *BufEnd);
  130. public:
  131. /// Lexer constructor - Create a new lexer object for the specified buffer
  132. /// with the specified preprocessor managing the lexing process. This lexer
  133. /// assumes that the associated file buffer and Preprocessor objects will
  134. /// outlive it, so it doesn't take ownership of either of them.
  135. Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP,
  136. bool IsFirstIncludeOfFile = true);
  137. /// Lexer constructor - Create a new raw lexer object. This object is only
  138. /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the
  139. /// text range will outlive it, so it doesn't take ownership of it.
  140. Lexer(SourceLocation FileLoc, const LangOptions &LangOpts,
  141. const char *BufStart, const char *BufPtr, const char *BufEnd,
  142. bool IsFirstIncludeOfFile = true);
  143. /// Lexer constructor - Create a new raw lexer object. This object is only
  144. /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the
  145. /// text range will outlive it, so it doesn't take ownership of it.
  146. Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
  147. const SourceManager &SM, const LangOptions &LangOpts,
  148. bool IsFirstIncludeOfFile = true);
  149. Lexer(const Lexer &) = delete;
  150. Lexer &operator=(const Lexer &) = delete;
  151. /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
  152. /// _Pragma expansion. This has a variety of magic semantics that this method
  153. /// sets up. It returns a new'd Lexer that must be delete'd when done.
  154. static Lexer *Create_PragmaLexer(SourceLocation SpellingLoc,
  155. SourceLocation ExpansionLocStart,
  156. SourceLocation ExpansionLocEnd,
  157. unsigned TokLen, Preprocessor &PP);
/// getFileLoc - Return the File Location for the file we are lexing out of.
/// The physical location encodes the location where the characters come from,
/// the virtual location encodes where we should *claim* the characters came
/// from. Currently this is only used by _Pragma handling.
///
/// Simple accessor; FileLoc is fixed at construction time.
SourceLocation getFileLoc() const { return FileLoc; }
  163. private:
  164. /// Lex - Return the next token in the file. If this is the end of file, it
  165. /// return the tok::eof token. This implicitly involves the preprocessor.
  166. bool Lex(Token &Result);
  167. /// Called when the preprocessor is in 'dependency scanning lexing mode'.
  168. bool LexDependencyDirectiveToken(Token &Result);
  169. /// Called when the preprocessor is in 'dependency scanning lexing mode' and
  170. /// is skipping a conditional block.
  171. bool LexDependencyDirectiveTokenWhileSkipping(Token &Result);
/// True when the preprocessor is in 'dependency scanning lexing mode' and
/// created this \p Lexer for lexing a set of dependency directive tokens.
///
/// Determined purely by whether a directive list was attached to this lexer.
bool isDependencyDirectivesLexer() const { return !DepDirectives.empty(); }
  175. /// Initializes \p Result with data from \p DDTok and advances \p BufferPtr to
  176. /// the position just after the token.
  177. /// \returns the buffer pointer at the beginning of the token.
  178. const char *convertDependencyDirectiveToken(
  179. const dependency_directives_scan::Token &DDTok, Token &Result);
  180. public:
/// isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
/// (i.e. it was created via Create_PragmaLexer for _Pragma expansion.)
bool isPragmaLexer() const { return Is_PragmaLexer; }
  183. private:
/// IndirectLex - An indirect call to 'Lex' that can be invoked via
/// the PreprocessorLexer interface.
///
/// Note: the boolean result of Lex is intentionally discarded here; the
/// PreprocessorLexer interface does not propagate it.
void IndirectLex(Token &Result) override { Lex(Result); }
  187. public:
  188. /// LexFromRawLexer - Lex a token from a designated raw lexer (one with no
  189. /// associated preprocessor object. Return true if the 'next character to
  190. /// read' pointer points at the end of the lexer buffer, false otherwise.
  191. bool LexFromRawLexer(Token &Result) {
  192. assert(LexingRawMode && "Not already in raw mode!");
  193. Lex(Result);
  194. // Note that lexing to the end of the buffer doesn't implicitly delete the
  195. // lexer when in raw mode.
  196. return BufferPtr == BufferEnd;
  197. }
  198. /// isKeepWhitespaceMode - Return true if the lexer should return tokens for
  199. /// every character in the file, including whitespace and comments. This
  200. /// should only be used in raw mode, as the preprocessor is not prepared to
  201. /// deal with the excess tokens.
  202. bool isKeepWhitespaceMode() const {
  203. return ExtendedTokenMode > 1;
  204. }
  205. /// SetKeepWhitespaceMode - This method lets clients enable or disable
  206. /// whitespace retention mode.
  207. void SetKeepWhitespaceMode(bool Val) {
  208. assert((!Val || LexingRawMode || LangOpts.TraditionalCPP) &&
  209. "Can only retain whitespace in raw mode or -traditional-cpp");
  210. ExtendedTokenMode = Val ? 2 : 0;
  211. }
  212. /// inKeepCommentMode - Return true if the lexer should return comments as
  213. /// tokens.
  214. bool inKeepCommentMode() const {
  215. return ExtendedTokenMode > 0;
  216. }
  217. /// SetCommentRetentionMode - Change the comment retention mode of the lexer
  218. /// to the specified mode. This is really only useful when lexing in raw
  219. /// mode, because otherwise the lexer needs to manage this.
  220. void SetCommentRetentionState(bool Mode) {
  221. assert(!isKeepWhitespaceMode() &&
  222. "Can't play with comment retention state when retaining whitespace");
  223. ExtendedTokenMode = Mode ? 1 : 0;
  224. }
  225. /// Sets the extended token mode back to its initial value, according to the
  226. /// language options and preprocessor. This controls whether the lexer
  227. /// produces comment and whitespace tokens.
  228. ///
  229. /// This requires the lexer to have an associated preprocessor. A standalone
  230. /// lexer has nothing to reset to.
  231. void resetExtendedTokenMode();
/// Gets source code buffer.
///
/// Returns a non-owning view over the entire buffer this lexer was created
/// with, from BufferStart up to (but not including) BufferEnd.
StringRef getBuffer() const {
  return StringRef(BufferStart, BufferEnd - BufferStart);
}
/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
/// uninterpreted string. This switches the lexer out of directive mode.
void ReadToEndOfLine(SmallVectorImpl<char> *Result = nullptr);

/// Diag - Forwarding function for diagnostics. This translate a source
/// position in the current buffer into a SourceLocation object for rendering.
DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const;

/// getSourceLocation - Return a source location identifier for the specified
/// offset in the current file.
SourceLocation getSourceLocation(const char *Loc, unsigned TokLen = 1) const;

/// getSourceLocation - Return a source location for the next character in
/// the current file.
///
/// Override of the PreprocessorLexer interface; forwards to the pointer
/// overload with the current buffer position.
SourceLocation getSourceLocation() override {
  return getSourceLocation(BufferPtr);
}

/// Return the current location in the buffer (the next character to lex).
const char *getBufferLocation() const { return BufferPtr; }
  252. /// Returns the current lexing offset.
  253. unsigned getCurrentBufferOffset() {
  254. assert(BufferPtr >= BufferStart && "Invalid buffer state");
  255. return BufferPtr - BufferStart;
  256. }
  257. /// Set the lexer's buffer pointer to \p Offset.
  258. void seek(unsigned Offset, bool IsAtStartOfLine);
  259. /// Stringify - Convert the specified string into a C string by i) escaping
  260. /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
  261. /// If Charify is true, this escapes the ' character instead of ".
  262. static std::string Stringify(StringRef Str, bool Charify = false);
  263. /// Stringify - Convert the specified string into a C string by i) escaping
  264. /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
  265. static void Stringify(SmallVectorImpl<char> &Str);
  266. /// getSpelling - This method is used to get the spelling of a token into a
  267. /// preallocated buffer, instead of as an std::string. The caller is required
  268. /// to allocate enough space for the token, which is guaranteed to be at least
  269. /// Tok.getLength() bytes long. The length of the actual result is returned.
  270. ///
  271. /// Note that this method may do two possible things: it may either fill in
  272. /// the buffer specified with characters, or it may *change the input pointer*
  273. /// to point to a constant buffer with the data already in it (avoiding a
  274. /// copy). The caller is not allowed to modify the returned buffer pointer
  275. /// if an internal buffer is returned.
  276. static unsigned getSpelling(const Token &Tok, const char *&Buffer,
  277. const SourceManager &SourceMgr,
  278. const LangOptions &LangOpts,
  279. bool *Invalid = nullptr);
  280. /// getSpelling() - Return the 'spelling' of the Tok token. The spelling of a
  281. /// token is the characters used to represent the token in the source file
  282. /// after trigraph expansion and escaped-newline folding. In particular, this
  283. /// wants to get the true, uncanonicalized, spelling of things like digraphs
  284. /// UCNs, etc.
  285. static std::string getSpelling(const Token &Tok,
  286. const SourceManager &SourceMgr,
  287. const LangOptions &LangOpts,
  288. bool *Invalid = nullptr);
  289. /// getSpelling - This method is used to get the spelling of the
  290. /// token at the given source location. If, as is usually true, it
  291. /// is not necessary to copy any data, then the returned string may
  292. /// not point into the provided buffer.
  293. ///
  294. /// This method lexes at the expansion depth of the given
  295. /// location and does not jump to the expansion or spelling
  296. /// location.
  297. static StringRef getSpelling(SourceLocation loc,
  298. SmallVectorImpl<char> &buffer,
  299. const SourceManager &SM,
  300. const LangOptions &options,
  301. bool *invalid = nullptr);
  302. /// MeasureTokenLength - Relex the token at the specified location and return
  303. /// its length in bytes in the input file. If the token needs cleaning (e.g.
  304. /// includes a trigraph or an escaped newline) then this count includes bytes
  305. /// that are part of that.
  306. static unsigned MeasureTokenLength(SourceLocation Loc,
  307. const SourceManager &SM,
  308. const LangOptions &LangOpts);
  309. /// Relex the token at the specified location.
  310. /// \returns true if there was a failure, false on success.
  311. static bool getRawToken(SourceLocation Loc, Token &Result,
  312. const SourceManager &SM,
  313. const LangOptions &LangOpts,
  314. bool IgnoreWhiteSpace = false);
  315. /// Given a location any where in a source buffer, find the location
  316. /// that corresponds to the beginning of the token in which the original
  317. /// source location lands.
  318. static SourceLocation GetBeginningOfToken(SourceLocation Loc,
  319. const SourceManager &SM,
  320. const LangOptions &LangOpts);
  321. /// Get the physical length (including trigraphs and escaped newlines) of the
  322. /// first \p Characters characters of the token starting at TokStart.
  323. static unsigned getTokenPrefixLength(SourceLocation TokStart,
  324. unsigned CharNo,
  325. const SourceManager &SM,
  326. const LangOptions &LangOpts);
  327. /// AdvanceToTokenCharacter - If the current SourceLocation specifies a
  328. /// location at the start of a token, return a new location that specifies a
  329. /// character within the token. This handles trigraphs and escaped newlines.
  330. static SourceLocation AdvanceToTokenCharacter(SourceLocation TokStart,
  331. unsigned Characters,
  332. const SourceManager &SM,
  333. const LangOptions &LangOpts) {
  334. return TokStart.getLocWithOffset(
  335. getTokenPrefixLength(TokStart, Characters, SM, LangOpts));
  336. }
  337. /// Computes the source location just past the end of the
  338. /// token at this source location.
  339. ///
  340. /// This routine can be used to produce a source location that
  341. /// points just past the end of the token referenced by \p Loc, and
  342. /// is generally used when a diagnostic needs to point just after a
  343. /// token where it expected something different that it received. If
  344. /// the returned source location would not be meaningful (e.g., if
  345. /// it points into a macro), this routine returns an invalid
  346. /// source location.
  347. ///
  348. /// \param Offset an offset from the end of the token, where the source
  349. /// location should refer to. The default offset (0) produces a source
  350. /// location pointing just past the end of the token; an offset of 1 produces
  351. /// a source location pointing to the last character in the token, etc.
  352. static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
  353. const SourceManager &SM,
  354. const LangOptions &LangOpts);
  355. /// Given a token range, produce a corresponding CharSourceRange that
  356. /// is not a token range. This allows the source range to be used by
  357. /// components that don't have access to the lexer and thus can't find the
  358. /// end of the range for themselves.
  359. static CharSourceRange getAsCharRange(SourceRange Range,
  360. const SourceManager &SM,
  361. const LangOptions &LangOpts) {
  362. SourceLocation End = getLocForEndOfToken(Range.getEnd(), 0, SM, LangOpts);
  363. return End.isInvalid() ? CharSourceRange()
  364. : CharSourceRange::getCharRange(
  365. Range.getBegin(), End);
  366. }
  367. static CharSourceRange getAsCharRange(CharSourceRange Range,
  368. const SourceManager &SM,
  369. const LangOptions &LangOpts) {
  370. return Range.isTokenRange()
  371. ? getAsCharRange(Range.getAsRange(), SM, LangOpts)
  372. : Range;
  373. }
  374. /// Returns true if the given MacroID location points at the first
  375. /// token of the macro expansion.
  376. ///
  377. /// \param MacroBegin If non-null and function returns true, it is set to
  378. /// begin location of the macro.
  379. static bool isAtStartOfMacroExpansion(SourceLocation loc,
  380. const SourceManager &SM,
  381. const LangOptions &LangOpts,
  382. SourceLocation *MacroBegin = nullptr);
  383. /// Returns true if the given MacroID location points at the last
  384. /// token of the macro expansion.
  385. ///
  386. /// \param MacroEnd If non-null and function returns true, it is set to
  387. /// end location of the macro.
  388. static bool isAtEndOfMacroExpansion(SourceLocation loc,
  389. const SourceManager &SM,
  390. const LangOptions &LangOpts,
  391. SourceLocation *MacroEnd = nullptr);
  392. /// Accepts a range and returns a character range with file locations.
  393. ///
  394. /// Returns a null range if a part of the range resides inside a macro
  395. /// expansion or the range does not reside on the same FileID.
  396. ///
  397. /// This function is trying to deal with macros and return a range based on
  398. /// file locations. The cases where it can successfully handle macros are:
  399. ///
  400. /// -begin or end range lies at the start or end of a macro expansion, in
  401. /// which case the location will be set to the expansion point, e.g:
  402. /// \#define M 1 2
  403. /// a M
  404. /// If you have a range [a, 2] (where 2 came from the macro), the function
  405. /// will return a range for "a M"
  406. /// if you have range [a, 1], the function will fail because the range
  407. /// overlaps with only a part of the macro
  408. ///
  409. /// -The macro is a function macro and the range can be mapped to the macro
  410. /// arguments, e.g:
  411. /// \#define M 1 2
  412. /// \#define FM(x) x
  413. /// FM(a b M)
  414. /// if you have range [b, 2], the function will return the file range "b M"
  415. /// inside the macro arguments.
  416. /// if you have range [a, 2], the function will return the file range
  417. /// "FM(a b M)" since the range includes all of the macro expansion.
  418. static CharSourceRange makeFileCharRange(CharSourceRange Range,
  419. const SourceManager &SM,
  420. const LangOptions &LangOpts);
  421. /// Returns a string for the source that the range encompasses.
  422. static StringRef getSourceText(CharSourceRange Range,
  423. const SourceManager &SM,
  424. const LangOptions &LangOpts,
  425. bool *Invalid = nullptr);
  426. /// Retrieve the name of the immediate macro expansion.
  427. ///
  428. /// This routine starts from a source location, and finds the name of the macro
  429. /// responsible for its immediate expansion. It looks through any intervening
  430. /// macro argument expansions to compute this. It returns a StringRef which
  431. /// refers to the SourceManager-owned buffer of the source where that macro
  432. /// name is spelled. Thus, the result shouldn't out-live that SourceManager.
  433. static StringRef getImmediateMacroName(SourceLocation Loc,
  434. const SourceManager &SM,
  435. const LangOptions &LangOpts);
  436. /// Retrieve the name of the immediate macro expansion.
  437. ///
  438. /// This routine starts from a source location, and finds the name of the
  439. /// macro responsible for its immediate expansion. It looks through any
  440. /// intervening macro argument expansions to compute this. It returns a
  441. /// StringRef which refers to the SourceManager-owned buffer of the source
  442. /// where that macro name is spelled. Thus, the result shouldn't out-live
  443. /// that SourceManager.
  444. ///
  445. /// This differs from Lexer::getImmediateMacroName in that any macro argument
  446. /// location will result in the topmost function macro that accepted it.
  447. /// e.g.
  448. /// \code
  449. /// MAC1( MAC2(foo) )
  450. /// \endcode
  451. /// for location of 'foo' token, this function will return "MAC1" while
  452. /// Lexer::getImmediateMacroName will return "MAC2".
  453. static StringRef getImmediateMacroNameForDiagnostics(
  454. SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts);
  455. /// Compute the preamble of the given file.
  456. ///
  457. /// The preamble of a file contains the initial comments, include directives,
  458. /// and other preprocessor directives that occur before the code in this
  459. /// particular file actually begins. The preamble of the main source file is
  460. /// a potential prefix header.
  461. ///
  462. /// \param Buffer The memory buffer containing the file's contents.
  463. ///
  464. /// \param MaxLines If non-zero, restrict the length of the preamble
  465. /// to fewer than this number of lines.
  466. ///
  467. /// \returns The offset into the file where the preamble ends and the rest
  468. /// of the file begins along with a boolean value indicating whether
  469. /// the preamble ends at the beginning of a new line.
  470. static PreambleBounds ComputePreamble(StringRef Buffer,
  471. const LangOptions &LangOpts,
  472. unsigned MaxLines = 0);
  473. /// Finds the token that comes right after the given location.
  474. ///
  475. /// Returns the next token, or none if the location is inside a macro.
  476. static std::optional<Token> findNextToken(SourceLocation Loc,
  477. const SourceManager &SM,
  478. const LangOptions &LangOpts);
  479. /// Checks that the given token is the first token that occurs after
  480. /// the given location (this excludes comments and whitespace). Returns the
  481. /// location immediately after the specified token. If the token is not found
  482. /// or the location is inside a macro, the returned source location will be
  483. /// invalid.
  484. static SourceLocation findLocationAfterToken(SourceLocation loc,
  485. tok::TokenKind TKind,
  486. const SourceManager &SM,
  487. const LangOptions &LangOpts,
  488. bool SkipTrailingWhitespaceAndNewLine);
  489. /// Returns true if the given character could appear in an identifier.
  490. static bool isAsciiIdentifierContinueChar(char c,
  491. const LangOptions &LangOpts);
  492. /// Checks whether new line pointed by Str is preceded by escape
  493. /// sequence.
  494. static bool isNewLineEscaped(const char *BufferStart, const char *Str);
  495. /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
  496. /// emit a warning.
  497. static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
  498. const LangOptions &LangOpts) {
  499. // If this is not a trigraph and not a UCN or escaped newline, return
  500. // quickly.
  501. if (isObviouslySimpleCharacter(Ptr[0])) {
  502. Size = 1;
  503. return *Ptr;
  504. }
  505. Size = 0;
  506. return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
  507. }
  508. /// Returns the leading whitespace for line that corresponds to the given
  509. /// location \p Loc.
  510. static StringRef getIndentationForLine(SourceLocation Loc,
  511. const SourceManager &SM);
/// Check if this is the first time we're lexing the input file.
/// (Set at construction from the IsFirstIncludeOfFile constructor argument.)
bool isFirstTimeLexingFile() const { return IsFirstTimeLexingFile; }
  514. private:
  515. //===--------------------------------------------------------------------===//
  516. // Internal implementation interfaces.
  517. /// LexTokenInternal - Internal interface to lex a preprocessing token. Called
  518. /// by Lex.
  519. ///
  520. bool LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine);
  521. bool CheckUnicodeWhitespace(Token &Result, uint32_t C, const char *CurPtr);
  522. bool LexUnicodeIdentifierStart(Token &Result, uint32_t C, const char *CurPtr);
  523. /// FormTokenWithChars - When we lex a token, we have identified a span
  524. /// starting at BufferPtr, going to TokEnd that forms the token. This method
  525. /// takes that range and assigns it to the token as its location and size. In
  526. /// addition, since tokens cannot overlap, this also updates BufferPtr to be
  527. /// TokEnd.
  528. void FormTokenWithChars(Token &Result, const char *TokEnd,
  529. tok::TokenKind Kind) {
  530. unsigned TokLen = TokEnd-BufferPtr;
  531. Result.setLength(TokLen);
  532. Result.setLocation(getSourceLocation(BufferPtr, TokLen));
  533. Result.setKind(Kind);
  534. BufferPtr = TokEnd;
  535. }
  536. /// isNextPPTokenLParen - Return 1 if the next unexpanded token will return a
  537. /// tok::l_paren token, 0 if it is something else and 2 if there are no more
  538. /// tokens in the buffer controlled by this lexer.
  539. unsigned isNextPPTokenLParen();
  540. //===--------------------------------------------------------------------===//
  541. // Lexer character reading interfaces.
  542. // This lexer is built on two interfaces for reading characters, both of which
  543. // automatically provide phase 1/2 translation. getAndAdvanceChar is used
  544. // when we know that we will be reading a character from the input buffer and
  545. // that this character will be part of the result token. This occurs in (f.e.)
  546. // string processing, because we know we need to read until we find the
  547. // closing '"' character.
  548. //
  549. // The second interface is the combination of getCharAndSize with
  550. // ConsumeChar. getCharAndSize reads a phase 1/2 translated character,
  551. // returning it and its size. If the lexer decides that this character is
  552. // part of the current token, it calls ConsumeChar on it. This two stage
  553. // approach allows us to emit diagnostics for characters (e.g. warnings about
  554. // trigraphs), knowing that they only are emitted if the character is
  555. // consumed.
  /// isObviouslySimpleCharacter - Return true if the specified character is
  /// obviously the same in translation phase 1 and translation phase 3. This
  /// can return false for characters that end up being the same, but it will
  /// never return true for something that needs to be mapped.
  static bool isObviouslySimpleCharacter(char C) {
    // Only '?' (possible trigraph) and '\\' (possible escaped newline or UCN)
    // can require phase 1/2 translation.
    return !(C == '?' || C == '\\');
  }
  563. /// getAndAdvanceChar - Read a single 'character' from the specified buffer,
  564. /// advance over it, and return it. This is tricky in several cases. Here we
  565. /// just handle the trivial case and fall-back to the non-inlined
  566. /// getCharAndSizeSlow method to handle the hard case.
  567. inline char getAndAdvanceChar(const char *&Ptr, Token &Tok) {
  568. // If this is not a trigraph and not a UCN or escaped newline, return
  569. // quickly.
  570. if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++;
  571. unsigned Size = 0;
  572. char C = getCharAndSizeSlow(Ptr, Size, &Tok);
  573. Ptr += Size;
  574. return C;
  575. }
  576. /// ConsumeChar - When a character (identified by getCharAndSize) is consumed
  577. /// and added to a given token, check to see if there are diagnostics that
  578. /// need to be emitted or flags that need to be set on the token. If so, do
  579. /// it.
  580. const char *ConsumeChar(const char *Ptr, unsigned Size, Token &Tok) {
  581. // Normal case, we consumed exactly one token. Just return it.
  582. if (Size == 1)
  583. return Ptr+Size;
  584. // Otherwise, re-lex the character with a current token, allowing
  585. // diagnostics to be emitted and flags to be set.
  586. Size = 0;
  587. getCharAndSizeSlow(Ptr, Size, &Tok);
  588. return Ptr+Size;
  589. }
  590. /// getCharAndSize - Peek a single 'character' from the specified buffer,
  591. /// get its size, and return it. This is tricky in several cases. Here we
  592. /// just handle the trivial case and fall-back to the non-inlined
  593. /// getCharAndSizeSlow method to handle the hard case.
  594. inline char getCharAndSize(const char *Ptr, unsigned &Size) {
  595. // If this is not a trigraph and not a UCN or escaped newline, return
  596. // quickly.
  597. if (isObviouslySimpleCharacter(Ptr[0])) {
  598. Size = 1;
  599. return *Ptr;
  600. }
  601. Size = 0;
  602. return getCharAndSizeSlow(Ptr, Size);
  603. }
  /// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize
  /// method. When \p Tok is non-null, diagnostics may be emitted and flags may
  /// be set on the token (see ConsumeChar).
  char getCharAndSizeSlow(const char *Ptr, unsigned &Size,
                          Token *Tok = nullptr);

  /// getEscapedNewLineSize - Return the size of the specified escaped newline,
  /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
  /// to this function.
  static unsigned getEscapedNewLineSize(const char *P);

  /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
  /// them), skip over them and return the first non-escaped-newline found,
  /// otherwise return P.
  static const char *SkipEscapedNewLines(const char *P);

  /// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a
  /// diagnostic.
  static char getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
                                       const LangOptions &LangOpts);
  //===--------------------------------------------------------------------===//
  // Other lexer functions.

  void SetByteOffset(unsigned Offset, bool StartOfLine);

  void PropagateLineStartLeadingSpaceInfo(Token &Result);

  // NOTE(review): presumably lexes a user-defined-literal suffix following a
  // string or character literal; IsStringLiteral distinguishes the two cases
  // — confirm against Lexer.cpp.
  const char *LexUDSuffix(Token &Result, const char *CurPtr,
                          bool IsStringLiteral);

  // Helper functions to lex the remainder of a token of the specific type.

  // This function handles both ASCII and Unicode identifiers after
  // the first codepoint of the identifier has been parsed.
  bool LexIdentifierContinue(Token &Result, const char *CurPtr);

  bool LexNumericConstant(Token &Result, const char *CurPtr);
  bool LexStringLiteral(Token &Result, const char *CurPtr,
                        tok::TokenKind Kind);
  bool LexRawStringLiteral(Token &Result, const char *CurPtr,
                           tok::TokenKind Kind);
  bool LexAngledStringLiteral(Token &Result, const char *CurPtr);
  bool LexCharConstant(Token &Result, const char *CurPtr,
                       tok::TokenKind Kind);
  bool LexEndOfFile(Token &Result, const char *CurPtr);

  bool SkipWhitespace(Token &Result, const char *CurPtr,
                      bool &TokAtPhysicalStartOfLine);
  bool SkipLineComment(Token &Result, const char *CurPtr,
                       bool &TokAtPhysicalStartOfLine);
  bool SkipBlockComment(Token &Result, const char *CurPtr,
                        bool &TokAtPhysicalStartOfLine);
  bool SaveLineComment(Token &Result, const char *CurPtr);

  bool IsStartOfConflictMarker(const char *CurPtr);
  bool HandleEndOfConflictMarker(const char *CurPtr);

  bool lexEditorPlaceholder(Token &Result, const char *CurPtr);

  bool isCodeCompletionPoint(const char *CurPtr) const;
  /// Stop lexing immediately by jumping BufferPtr to BufferEnd, so subsequent
  /// lexing sees an exhausted buffer.
  void cutOffLexing() { BufferPtr = BufferEnd; }
  // NOTE(review): presumably reports whether the numeric literal starting at
  // Start is hexadecimal — confirm against Lexer.cpp.
  bool isHexaLiteral(const char *Start, const LangOptions &LangOpts);

  void codeCompleteIncludedFile(const char *PathStart,
                                const char *CompletionPoint, bool IsAngled);

  // NOTE(review): these appear to parse the numeric and named forms of a
  // universal-character-name respectively; see tryReadUCN below — confirm.
  std::optional<uint32_t>
  tryReadNumericUCN(const char *&StartPtr, const char *SlashLoc, Token *Result);
  std::optional<uint32_t> tryReadNamedUCN(const char *&StartPtr,
                                          const char *SlashLoc, Token *Result);

  /// Read a universal character name.
  ///
  /// \param StartPtr The position in the source buffer after the initial '\'.
  ///                 If the UCN is syntactically well-formed (but not
  ///                 necessarily valid), this parameter will be updated to
  ///                 point to the character after the UCN.
  /// \param SlashLoc The position in the source buffer of the '\'.
  /// \param Result   The token being formed. Pass \c nullptr to suppress
  ///                 diagnostics and handle token formation in the caller.
  ///
  /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
  ///         invalid.
  uint32_t tryReadUCN(const char *&StartPtr, const char *SlashLoc, Token *Result);

  /// Try to consume a UCN as part of an identifier at the current
  /// location.
  /// \param CurPtr Initially points to the range of characters in the source
  ///               buffer containing the '\'. Updated to point past the end of
  ///               the UCN on success.
  /// \param Size The number of characters occupied by the '\' (including
  ///             trigraphs and escaped newlines).
  /// \param Result The token being produced. Marked as containing a UCN on
  ///               success.
  /// \return \c true if a UCN was lexed and it produced an acceptable
  ///         identifier character, \c false otherwise.
  bool tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                               Token &Result);

  /// Try to consume an identifier character encoded in UTF-8.
  /// \param CurPtr Points to the start of the (potential) UTF-8 code unit
  ///               sequence. On success, updated to point past the end of it.
  /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier
  ///         character was lexed, \c false otherwise.
  bool tryConsumeIdentifierUTF8Char(const char *&CurPtr);
  690. };
  691. } // namespace clang
  692. #endif // LLVM_CLANG_LEX_LEXER_H
  693. #ifdef __GNUC__
  694. #pragma GCC diagnostic pop
  695. #endif