TGLexer.h 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383
  1. //===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This class represents the Lexer for tablegen files.
  10. //
  11. //===----------------------------------------------------------------------===//
  12. #ifndef LLVM_LIB_TABLEGEN_TGLEXER_H
  13. #define LLVM_LIB_TABLEGEN_TGLEXER_H
  14. #include "llvm/ADT/StringRef.h"
  15. #include "llvm/ADT/StringSet.h"
  16. #include "llvm/Support/DataTypes.h"
  17. #include "llvm/Support/SMLoc.h"
  18. #include <cassert>
  19. #include <memory>
  20. #include <set>
  21. #include <string>
  22. #include <vector>
  23. namespace llvm {
  24. template <typename T> class ArrayRef;
  25. class SourceMgr;
  26. class Twine;
  namespace tgtok {
  /// Kinds of tokens known to the TableGen lexer.
  ///
  /// No enumerator has an explicit value, so the numeric values simply follow
  /// declaration order (Eof is 0).
  enum TokKind {
    // Markers
    Eof,
    Error,

    // Tokens with no info.
    minus, plus,        // - +
    l_square, r_square, // [ ]
    l_brace, r_brace,   // { }
    l_paren, r_paren,   // ( )
    less, greater,      // < >
    colon, semi,        // : ;
    comma, dot,         // , .
    equal, question,    // = ?
    paste,              // #
    dotdotdot,          // ...

    // Reserved keywords. ('ElseKW' is named to distinguish it from the
    // existing 'Else' that means the preprocessor #else.)
    Assert, Bit, Bits, Class, Code, Dag, Def, Defm, Defset, Defvar, ElseKW,
    FalseKW, Field, Foreach, If, In, Include, Int, Let, List, MultiClass,
    String, Then, TrueKW,

    // Bang operators.
    XConcat, XADD, XSUB, XMUL, XDIV, XNOT, XLOG2, XAND, XOR, XXOR, XSRA, XSRL,
    XSHL, XListConcat, XListSplat, XStrConcat, XInterleave, XSubstr, XFind,
    XCast, XSubst, XForEach, XFilter, XFoldl, XHead, XTail, XSize, XEmpty, XIf,
    XCond, XEq, XIsA, XDag, XNe, XLe, XLt, XGe, XGt, XSetDagOp, XGetDagOp,
    XExists, XListRemove,

    // Boolean literals.
    TrueVal, FalseVal,

    // Integer value.
    IntVal,

    // Binary constant.  Note that these are sized according to the number of
    // bits given.
    BinaryIntVal,

    // String valued tokens.
    Id, StrVal, VarName, CodeFragment,

    // Preprocessing tokens for internal usage by the lexer.
    // They are never returned as a result of Lex().
    Ifdef, Ifndef, Else, Endif, Define
  };
  } // end namespace tgtok
  67. /// TGLexer - TableGen Lexer class.
  68. class TGLexer {
  69. SourceMgr &SrcMgr;
  70. const char *CurPtr = nullptr;
  71. StringRef CurBuf;
  72. // Information about the current token.
  73. const char *TokStart = nullptr;
  74. tgtok::TokKind CurCode = tgtok::TokKind::Eof;
  75. std::string CurStrVal; // This is valid for Id, StrVal, VarName, CodeFragment
  76. int64_t CurIntVal = 0; // This is valid for IntVal.
  77. /// CurBuffer - This is the current buffer index we're lexing from as managed
  78. /// by the SourceMgr object.
  79. unsigned CurBuffer = 0;
  80. public:
  81. typedef std::set<std::string> DependenciesSetTy;
  82. private:
  83. /// Dependencies - This is the list of all included files.
  84. DependenciesSetTy Dependencies;
  85. public:
  86. TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros);
  87. tgtok::TokKind Lex() {
  88. return CurCode = LexToken(CurPtr == CurBuf.begin());
  89. }
  90. const DependenciesSetTy &getDependencies() const {
  91. return Dependencies;
  92. }
  93. tgtok::TokKind getCode() const { return CurCode; }
  94. const std::string &getCurStrVal() const {
  95. assert((CurCode == tgtok::Id || CurCode == tgtok::StrVal ||
  96. CurCode == tgtok::VarName || CurCode == tgtok::CodeFragment) &&
  97. "This token doesn't have a string value");
  98. return CurStrVal;
  99. }
  100. int64_t getCurIntVal() const {
  101. assert(CurCode == tgtok::IntVal && "This token isn't an integer");
  102. return CurIntVal;
  103. }
  104. std::pair<int64_t, unsigned> getCurBinaryIntVal() const {
  105. assert(CurCode == tgtok::BinaryIntVal &&
  106. "This token isn't a binary integer");
  107. return std::make_pair(CurIntVal, (CurPtr - TokStart)-2);
  108. }
  109. SMLoc getLoc() const;
  110. SMRange getLocRange() const;
  111. private:
  112. /// LexToken - Read the next token and return its code.
  113. tgtok::TokKind LexToken(bool FileOrLineStart = false);
  114. tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg);
  115. tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg);
  116. int getNextChar();
  117. int peekNextChar(int Index) const;
  118. void SkipBCPLComment();
  119. bool SkipCComment();
  120. tgtok::TokKind LexIdentifier();
  121. bool LexInclude();
  122. tgtok::TokKind LexString();
  123. tgtok::TokKind LexVarName();
  124. tgtok::TokKind LexNumber();
  125. tgtok::TokKind LexBracket();
  126. tgtok::TokKind LexExclaim();
  127. // Process EOF encountered in LexToken().
  128. // If EOF is met in an include file, then the method will update
  129. // CurPtr, CurBuf and preprocessing include stack, and return true.
  130. // If EOF is met in the top-level file, then the method will
  131. // update and check the preprocessing include stack, and return false.
  132. bool processEOF();
  133. // *** Structures and methods for preprocessing support ***
  134. // A set of macro names that are defined either via command line or
  135. // by using:
  136. // #define NAME
  137. StringSet<> DefinedMacros;
  138. // Each of #ifdef and #else directives has a descriptor associated
  139. // with it.
  140. //
  141. // An ordered list of preprocessing controls defined by #ifdef/#else
  142. // directives that are in effect currently is called preprocessing
  143. // control stack. It is represented as a vector of PreprocessorControlDesc's.
  144. //
  145. // The control stack is updated according to the following rules:
  146. //
  147. // For each #ifdef we add an element to the control stack.
  148. // For each #else we replace the top element with a descriptor
  149. // with an inverted IsDefined value.
  150. // For each #endif we pop the top element from the control stack.
  151. //
  152. // When CurPtr reaches the current buffer's end, the control stack
  153. // must be empty, i.e. #ifdef and the corresponding #endif
  154. // must be located in the same file.
  155. struct PreprocessorControlDesc {
  156. // Either tgtok::Ifdef or tgtok::Else.
  157. tgtok::TokKind Kind;
  158. // True, if the condition for this directive is true, false - otherwise.
  159. // Examples:
  160. // #ifdef NAME : true, if NAME is defined, false - otherwise.
  161. // ...
  162. // #else : false, if NAME is defined, true - otherwise.
  163. bool IsDefined;
  164. // Pointer into CurBuf to the beginning of the preprocessing directive
  165. // word, e.g.:
  166. // #ifdef NAME
  167. // ^ - SrcPos
  168. SMLoc SrcPos;
  169. };
  170. // We want to disallow code like this:
  171. // file1.td:
  172. // #define NAME
  173. // #ifdef NAME
  174. // include "file2.td"
  175. // EOF
  176. // file2.td:
  177. // #endif
  178. // EOF
  179. //
  180. // To do this, we clear the preprocessing control stack on entry
  181. // to each of the included file. PrepIncludeStack is used to store
  182. // preprocessing control stacks for the current file and all its
  183. // parent files. The back() element is the preprocessing control
  184. // stack for the current file.
  185. std::vector<std::unique_ptr<std::vector<PreprocessorControlDesc>>>
  186. PrepIncludeStack;
  187. // Validate that the current preprocessing control stack is empty,
  188. // since we are about to exit a file, and pop the include stack.
  189. //
  190. // If IncludeStackMustBeEmpty is true, the include stack must be empty
  191. // after the popping, otherwise, the include stack must not be empty
  192. // after the popping. Basically, the include stack must be empty
  193. // only if we exit the "top-level" file (i.e. finish lexing).
  194. //
  195. // The method returns false, if the current preprocessing control stack
  196. // is not empty (e.g. there is an unterminated #ifdef/#else),
  197. // true - otherwise.
  198. bool prepExitInclude(bool IncludeStackMustBeEmpty);
  199. // Look ahead for a preprocessing directive starting from CurPtr. The caller
  200. // must only call this method, if *(CurPtr - 1) is '#'. If the method matches
  201. // a preprocessing directive word followed by a whitespace, then it returns
  202. // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define.
  203. //
  204. // CurPtr is not adjusted by this method.
  205. tgtok::TokKind prepIsDirective() const;
  206. // Given a preprocessing token kind, adjusts CurPtr to the end
  207. // of the preprocessing directive word. Returns true, unless
  208. // an unsupported token kind is passed in.
  209. //
  210. // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective()
  211. // to avoid adjusting CurPtr before we are sure that '#' is followed
  212. // by a preprocessing directive. If it is not, then we fall back to
  213. // tgtok::paste interpretation of '#'.
  214. bool prepEatPreprocessorDirective(tgtok::TokKind Kind);
  215. // The main "exit" point from the token parsing to preprocessor.
  216. //
  217. // The method is called for CurPtr, when prepIsDirective() returns
  218. // true. The first parameter matches the result of prepIsDirective(),
  219. // denoting the actual preprocessor directive to be processed.
  220. //
  221. // If the preprocessing directive disables the tokens processing, e.g.:
  222. // #ifdef NAME // NAME is undefined
  223. // then lexPreprocessor() enters the lines-skipping mode.
  224. // In this mode, it does not parse any tokens, because the code under
  225. // the #ifdef may not even be a correct tablegen code. The preprocessor
  226. // looks for lines containing other preprocessing directives, which
  227. // may be prepended with whitespaces and C-style comments. If the line
  228. // does not contain a preprocessing directive, it is skipped completely.
  229. // Otherwise, the preprocessing directive is processed by recursively
  230. // calling lexPreprocessor(). The processing of the encountered
  231. // preprocessing directives includes updating preprocessing control stack
  232. // and adding new macros into DefinedMacros set.
  233. //
  234. // The second parameter controls whether lexPreprocessor() is called from
  235. // LexToken() (true) or recursively from lexPreprocessor() (false).
  236. //
  237. // If ReturnNextLiveToken is true, the method returns the next
  238. // LEX token following the current directive or following the end
  239. // of the disabled preprocessing region corresponding to this directive.
  240. // If ReturnNextLiveToken is false, the method returns the first parameter,
  241. // unless there were errors encountered in the disabled preprocessing
  242. // region - in this case, it returns tgtok::Error.
  243. tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind,
  244. bool ReturnNextLiveToken = true);
  245. // Worker method for lexPreprocessor() to skip lines after some
  246. // preprocessing directive up to the buffer end or to the directive
  247. // that re-enables token processing. The method returns true
  248. // upon processing the next directive that re-enables tokens
  249. // processing. False is returned if an error was encountered.
  250. //
  251. // Note that prepSkipRegion() calls lexPreprocessor() to process
  252. // encountered preprocessing directives. In this case, the second
  253. // parameter to lexPreprocessor() is set to false. Being passed
  254. // false ReturnNextLiveToken, lexPreprocessor() must never call
  255. // prepSkipRegion(). We assert this by passing ReturnNextLiveToken
  256. // to prepSkipRegion() and checking that it is never set to false.
  257. bool prepSkipRegion(bool MustNeverBeFalse);
  258. // Lex name of the macro after either #ifdef or #define. We could have used
  259. // LexIdentifier(), but it has special handling of "include" word, which
  260. // could result in awkward diagnostic errors. Consider:
  261. // ----
  262. // #ifdef include
  263. // class ...
  264. // ----
  265. // LexIdentifier() will engage LexInclude(), which will complain about
  266. // missing file with name "class". Instead, prepLexMacroName() will treat
  267. // "include" as a normal macro name.
  268. //
  269. // On entry, CurPtr points to the end of a preprocessing directive word.
  270. // The method allows for whitespaces between the preprocessing directive
  271. // and the macro name. The allowed whitespaces are ' ' and '\t'.
  272. //
  273. // If the first non-whitespace symbol after the preprocessing directive
  274. // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then
  275. // the method updates TokStart to the position of the first non-whitespace
  276. // symbol, sets CurPtr to the position of the macro name's last symbol,
  277. // and returns a string reference to the macro name. Otherwise,
  278. // TokStart is set to the first non-whitespace symbol after the preprocessing
  279. // directive, and the method returns an empty string reference.
  280. //
  281. // In all cases, TokStart may be used to point to the word following
  282. // the preprocessing directive.
  283. StringRef prepLexMacroName();
  284. // Skip any whitespaces starting from CurPtr. The method is used
  285. // only in the lines-skipping mode to find the first non-whitespace
  286. // symbol after or at CurPtr. Allowed whitespaces are ' ', '\t', '\n'
  287. // and '\r'. The method skips C-style comments as well, because
  288. // it is used to find the beginning of the preprocessing directive.
  289. // If we do not handle C-style comments the following code would
  290. // result in incorrect detection of a preprocessing directive:
  291. // /*
  292. // #ifdef NAME
  293. // */
  294. // As long as we skip C-style comments, the following code is correctly
  295. // recognized as a preprocessing directive:
  296. // /* first line comment
  297. // second line comment */ #ifdef NAME
  298. //
  299. // The method returns true upon reaching the first non-whitespace symbol
  300. // or EOF, CurPtr is set to point to this symbol. The method returns false,
  301. // if an error occurred during skipping of a C-style comment.
  302. bool prepSkipLineBegin();
  303. // Skip any whitespaces or comments after a preprocessing directive.
  304. // The method returns true upon reaching either end of the line
  305. // or end of the file. If there is a multiline C-style comment
  306. // after the preprocessing directive, the method skips
  307. // the comment, so the final CurPtr may point to one of the next lines.
  308. // The method returns false, if an error occurred during skipping
  309. // C- or C++-style comment, or a non-whitespace symbol appears
  310. // after the preprocessing directive.
  311. //
  312. // The method maybe called both during lines-skipping and tokens
  313. // processing. It actually verifies that only whitespaces or/and
  314. // comments follow a preprocessing directive.
  315. //
  316. // After the execution of this mehod, CurPtr points either to new line
  317. // symbol, buffer end or non-whitespace symbol following the preprocesing
  318. // directive.
  319. bool prepSkipDirectiveEnd();
  320. // Skip all symbols to the end of the line/file.
  321. // The method adjusts CurPtr, so that it points to either new line
  322. // symbol in the current line or the buffer end.
  323. void prepSkipToLineEnd();
  324. // Return true, if the current preprocessor control stack is such that
  325. // we should allow lexer to process the next token, false - otherwise.
  326. //
  327. // In particular, the method returns true, if all the #ifdef/#else
  328. // controls on the stack have their IsDefined member set to true.
  329. bool prepIsProcessingEnabled();
  330. // Report an error, if we reach EOF with non-empty preprocessing control
  331. // stack. This means there is no matching #endif for the previous
  332. // #ifdef/#else.
  333. void prepReportPreprocessorStackError();
  334. };
  335. } // end namespace llvm
  336. #endif