// Token.h (12 KB)
  1. #pragma once
  2. #ifdef __GNUC__
  3. #pragma GCC diagnostic push
  4. #pragma GCC diagnostic ignored "-Wunused-parameter"
  5. #endif
  6. //===--- Token.h - Token interface ------------------------------*- C++ -*-===//
  7. //
  8. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  9. // See https://llvm.org/LICENSE.txt for license information.
  10. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  11. //
  12. //===----------------------------------------------------------------------===//
  13. //
  14. // This file defines the Token interface.
  15. //
  16. //===----------------------------------------------------------------------===//
  17. #ifndef LLVM_CLANG_LEX_TOKEN_H
  18. #define LLVM_CLANG_LEX_TOKEN_H
  19. #include "clang/Basic/SourceLocation.h"
  20. #include "clang/Basic/TokenKinds.h"
  21. #include "llvm/ADT/ArrayRef.h"
  22. #include "llvm/ADT/StringRef.h"
  23. #include <cassert>
  24. namespace clang {
  25. class IdentifierInfo;
  26. /// Token - This structure provides full information about a lexed token.
  27. /// It is not intended to be space efficient, it is intended to return as much
  28. /// information as possible about each returned token. This is expected to be
  29. /// compressed into a smaller form if memory footprint is important.
  30. ///
  31. /// The parser can create a special "annotation token" representing a stream of
  32. /// tokens that were parsed and semantically resolved, e.g.: "foo::MyClass<int>"
  33. /// can be represented by a single typename annotation token that carries
  34. /// information about the SourceRange of the tokens and the type object.
  35. class Token {
  36. /// The location of the token. This is actually a SourceLocation.
  37. SourceLocation::UIntTy Loc;
  38. // Conceptually these next two fields could be in a union. However, this
  39. // causes gcc 4.2 to pessimize LexTokenInternal, a very performance critical
  40. // routine. Keeping as separate members with casts until a more beautiful fix
  41. // presents itself.
  42. /// UintData - This holds either the length of the token text, when
  43. /// a normal token, or the end of the SourceRange when an annotation
  44. /// token.
  45. SourceLocation::UIntTy UintData;
  46. /// PtrData - This is a union of four different pointer types, which depends
  47. /// on what type of token this is:
  48. /// Identifiers, keywords, etc:
  49. /// This is an IdentifierInfo*, which contains the uniqued identifier
  50. /// spelling.
  51. /// Literals: isLiteral() returns true.
  52. /// This is a pointer to the start of the token in a text buffer, which
  53. /// may be dirty (have trigraphs / escaped newlines).
  54. /// Annotations (resolved type names, C++ scopes, etc): isAnnotation().
  55. /// This is a pointer to sema-specific data for the annotation token.
  56. /// Eof:
  57. // This is a pointer to a Decl.
  58. /// Other:
  59. /// This is null.
  60. void *PtrData;
  61. /// Kind - The actual flavor of token this is.
  62. tok::TokenKind Kind;
  63. /// Flags - Bits we track about this token, members of the TokenFlags enum.
  64. unsigned short Flags;
  65. public:
  66. // Various flags set per token:
  67. enum TokenFlags {
  68. StartOfLine = 0x01, // At start of line or only after whitespace
  69. // (considering the line after macro expansion).
  70. LeadingSpace = 0x02, // Whitespace exists before this token (considering
  71. // whitespace after macro expansion).
  72. DisableExpand = 0x04, // This identifier may never be macro expanded.
  73. NeedsCleaning = 0x08, // Contained an escaped newline or trigraph.
  74. LeadingEmptyMacro = 0x10, // Empty macro exists before this token.
  75. HasUDSuffix = 0x20, // This string or character literal has a ud-suffix.
  76. HasUCN = 0x40, // This identifier contains a UCN.
  77. IgnoredComma = 0x80, // This comma is not a macro argument separator (MS).
  78. StringifiedInMacro = 0x100, // This string or character literal is formed by
  79. // macro stringizing or charizing operator.
  80. CommaAfterElided = 0x200, // The comma following this token was elided (MS).
  81. IsEditorPlaceholder = 0x400, // This identifier is a placeholder.
  82. IsReinjected = 0x800, // A phase 4 token that was produced before and
  83. // re-added, e.g. via EnterTokenStream. Annotation
  84. // tokens are *not* reinjected.
  85. };
  86. tok::TokenKind getKind() const { return Kind; }
  87. void setKind(tok::TokenKind K) { Kind = K; }
  88. /// is/isNot - Predicates to check if this token is a specific kind, as in
  89. /// "if (Tok.is(tok::l_brace)) {...}".
  90. bool is(tok::TokenKind K) const { return Kind == K; }
  91. bool isNot(tok::TokenKind K) const { return Kind != K; }
  92. bool isOneOf(tok::TokenKind K1, tok::TokenKind K2) const {
  93. return is(K1) || is(K2);
  94. }
  95. template <typename... Ts> bool isOneOf(tok::TokenKind K1, Ts... Ks) const {
  96. return is(K1) || isOneOf(Ks...);
  97. }
  98. /// Return true if this is a raw identifier (when lexing
  99. /// in raw mode) or a non-keyword identifier (when lexing in non-raw mode).
  100. bool isAnyIdentifier() const {
  101. return tok::isAnyIdentifier(getKind());
  102. }
  103. /// Return true if this is a "literal", like a numeric
  104. /// constant, string, etc.
  105. bool isLiteral() const {
  106. return tok::isLiteral(getKind());
  107. }
  108. /// Return true if this is any of tok::annot_* kind tokens.
  109. bool isAnnotation() const {
  110. return tok::isAnnotation(getKind());
  111. }
  112. /// Return a source location identifier for the specified
  113. /// offset in the current file.
  114. SourceLocation getLocation() const {
  115. return SourceLocation::getFromRawEncoding(Loc);
  116. }
  117. unsigned getLength() const {
  118. assert(!isAnnotation() && "Annotation tokens have no length field");
  119. return UintData;
  120. }
  121. void setLocation(SourceLocation L) { Loc = L.getRawEncoding(); }
  122. void setLength(unsigned Len) {
  123. assert(!isAnnotation() && "Annotation tokens have no length field");
  124. UintData = Len;
  125. }
  126. SourceLocation getAnnotationEndLoc() const {
  127. assert(isAnnotation() && "Used AnnotEndLocID on non-annotation token");
  128. return SourceLocation::getFromRawEncoding(UintData ? UintData : Loc);
  129. }
  130. void setAnnotationEndLoc(SourceLocation L) {
  131. assert(isAnnotation() && "Used AnnotEndLocID on non-annotation token");
  132. UintData = L.getRawEncoding();
  133. }
  134. SourceLocation getLastLoc() const {
  135. return isAnnotation() ? getAnnotationEndLoc() : getLocation();
  136. }
  137. SourceLocation getEndLoc() const {
  138. return isAnnotation() ? getAnnotationEndLoc()
  139. : getLocation().getLocWithOffset(getLength());
  140. }
  141. /// SourceRange of the group of tokens that this annotation token
  142. /// represents.
  143. SourceRange getAnnotationRange() const {
  144. return SourceRange(getLocation(), getAnnotationEndLoc());
  145. }
  146. void setAnnotationRange(SourceRange R) {
  147. setLocation(R.getBegin());
  148. setAnnotationEndLoc(R.getEnd());
  149. }
  150. const char *getName() const { return tok::getTokenName(Kind); }
  151. /// Reset all flags to cleared.
  152. void startToken() {
  153. Kind = tok::unknown;
  154. Flags = 0;
  155. PtrData = nullptr;
  156. UintData = 0;
  157. Loc = SourceLocation().getRawEncoding();
  158. }
  159. bool hasPtrData() const { return PtrData != nullptr; }
  160. IdentifierInfo *getIdentifierInfo() const {
  161. assert(isNot(tok::raw_identifier) &&
  162. "getIdentifierInfo() on a tok::raw_identifier token!");
  163. assert(!isAnnotation() &&
  164. "getIdentifierInfo() on an annotation token!");
  165. if (isLiteral()) return nullptr;
  166. if (is(tok::eof)) return nullptr;
  167. return (IdentifierInfo*) PtrData;
  168. }
  169. void setIdentifierInfo(IdentifierInfo *II) {
  170. PtrData = (void*) II;
  171. }
  172. const void *getEofData() const {
  173. assert(is(tok::eof));
  174. return reinterpret_cast<const void *>(PtrData);
  175. }
  176. void setEofData(const void *D) {
  177. assert(is(tok::eof));
  178. assert(!PtrData);
  179. PtrData = const_cast<void *>(D);
  180. }
  181. /// getRawIdentifier - For a raw identifier token (i.e., an identifier
  182. /// lexed in raw mode), returns a reference to the text substring in the
  183. /// buffer if known.
  184. StringRef getRawIdentifier() const {
  185. assert(is(tok::raw_identifier));
  186. return StringRef(reinterpret_cast<const char *>(PtrData), getLength());
  187. }
  188. void setRawIdentifierData(const char *Ptr) {
  189. assert(is(tok::raw_identifier));
  190. PtrData = const_cast<char*>(Ptr);
  191. }
  192. /// getLiteralData - For a literal token (numeric constant, string, etc), this
  193. /// returns a pointer to the start of it in the text buffer if known, null
  194. /// otherwise.
  195. const char *getLiteralData() const {
  196. assert(isLiteral() && "Cannot get literal data of non-literal");
  197. return reinterpret_cast<const char*>(PtrData);
  198. }
  199. void setLiteralData(const char *Ptr) {
  200. assert(isLiteral() && "Cannot set literal data of non-literal");
  201. PtrData = const_cast<char*>(Ptr);
  202. }
  203. void *getAnnotationValue() const {
  204. assert(isAnnotation() && "Used AnnotVal on non-annotation token");
  205. return PtrData;
  206. }
  207. void setAnnotationValue(void *val) {
  208. assert(isAnnotation() && "Used AnnotVal on non-annotation token");
  209. PtrData = val;
  210. }
  211. /// Set the specified flag.
  212. void setFlag(TokenFlags Flag) {
  213. Flags |= Flag;
  214. }
  215. /// Get the specified flag.
  216. bool getFlag(TokenFlags Flag) const {
  217. return (Flags & Flag) != 0;
  218. }
  219. /// Unset the specified flag.
  220. void clearFlag(TokenFlags Flag) {
  221. Flags &= ~Flag;
  222. }
  223. /// Return the internal represtation of the flags.
  224. ///
  225. /// This is only intended for low-level operations such as writing tokens to
  226. /// disk.
  227. unsigned getFlags() const {
  228. return Flags;
  229. }
  230. /// Set a flag to either true or false.
  231. void setFlagValue(TokenFlags Flag, bool Val) {
  232. if (Val)
  233. setFlag(Flag);
  234. else
  235. clearFlag(Flag);
  236. }
  237. /// isAtStartOfLine - Return true if this token is at the start of a line.
  238. ///
  239. bool isAtStartOfLine() const { return getFlag(StartOfLine); }
  240. /// Return true if this token has whitespace before it.
  241. ///
  242. bool hasLeadingSpace() const { return getFlag(LeadingSpace); }
  243. /// Return true if this identifier token should never
  244. /// be expanded in the future, due to C99 6.10.3.4p2.
  245. bool isExpandDisabled() const { return getFlag(DisableExpand); }
  246. /// Return true if we have an ObjC keyword identifier.
  247. bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const;
  248. /// Return the ObjC keyword kind.
  249. tok::ObjCKeywordKind getObjCKeywordID() const;
  250. /// Return true if this token has trigraphs or escaped newlines in it.
  251. bool needsCleaning() const { return getFlag(NeedsCleaning); }
  252. /// Return true if this token has an empty macro before it.
  253. ///
  254. bool hasLeadingEmptyMacro() const { return getFlag(LeadingEmptyMacro); }
  255. /// Return true if this token is a string or character literal which
  256. /// has a ud-suffix.
  257. bool hasUDSuffix() const { return getFlag(HasUDSuffix); }
  258. /// Returns true if this token contains a universal character name.
  259. bool hasUCN() const { return getFlag(HasUCN); }
  260. /// Returns true if this token is formed by macro by stringizing or charizing
  261. /// operator.
  262. bool stringifiedInMacro() const { return getFlag(StringifiedInMacro); }
  263. /// Returns true if the comma after this token was elided.
  264. bool commaAfterElided() const { return getFlag(CommaAfterElided); }
  265. /// Returns true if this token is an editor placeholder.
  266. ///
  267. /// Editor placeholders are produced by the code-completion engine and are
  268. /// represented as characters between '<#' and '#>' in the source code. The
  269. /// lexer uses identifier tokens to represent placeholders.
  270. bool isEditorPlaceholder() const { return getFlag(IsEditorPlaceholder); }
  271. };
  272. /// Information about the conditional stack (\#if directives)
  273. /// currently active.
  274. struct PPConditionalInfo {
  275. /// Location where the conditional started.
  276. SourceLocation IfLoc;
  277. /// True if this was contained in a skipping directive, e.g.,
  278. /// in a "\#if 0" block.
  279. bool WasSkipping;
  280. /// True if we have emitted tokens already, and now we're in
  281. /// an \#else block or something. Only useful in Skipping blocks.
  282. bool FoundNonSkip;
  283. /// True if we've seen a \#else in this block. If so,
  284. /// \#elif/\#else directives are not allowed.
  285. bool FoundElse;
  286. };
  287. // Extra information needed for annonation tokens.
  288. struct PragmaLoopHintInfo {
  289. Token PragmaName;
  290. Token Option;
  291. ArrayRef<Token> Toks;
  292. };
  293. } // end namespace clang
  294. #endif // LLVM_CLANG_LEX_TOKEN_H
  295. #ifdef __GNUC__
  296. #pragma GCC diagnostic pop
  297. #endif