//===--- FormatTokenLexer.h - Format C++ code ----------------*- C++ ----*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains FormatTokenLexer, which tokenizes a source file
/// into a token stream suitable for ClangFormat.
///
//===----------------------------------------------------------------------===//
  14. #ifndef LLVM_CLANG_LIB_FORMAT_FORMATTOKENLEXER_H
  15. #define LLVM_CLANG_LIB_FORMAT_FORMATTOKENLEXER_H
  16. #include "Encoding.h"
  17. #include "FormatToken.h"
  18. #include "clang/Basic/LangOptions.h"
  19. #include "clang/Basic/SourceLocation.h"
  20. #include "clang/Basic/SourceManager.h"
  21. #include "clang/Format/Format.h"
  22. #include "llvm/ADT/MapVector.h"
  23. #include "llvm/ADT/StringSet.h"
  24. #include "llvm/Support/Regex.h"
  25. #include <stack>
  26. namespace clang {
  27. namespace format {
  /// Lexing modes of FormatTokenLexer; the lexer keeps a std::stack of these
  /// in its StateStack member.
  ///
  /// Deliberately left as an unscoped enum with implicit values (0, 1, 2) so
  /// that both `NORMAL` and `LexerState::NORMAL` spellings keep compiling.
  enum LexerState {
    /// NOTE(review): presumably the default mode outside of any template
    /// string -- confirm against FormatTokenLexer.cpp.
    NORMAL,
    /// NOTE(review): presumably active while inside a JavaScript template
    /// string (see FormatTokenLexer::handleTemplateStrings) -- confirm.
    TEMPLATE_STRING,
    /// NOTE(review): presumably signals that a token was set aside and is
    /// fetched via FormatTokenLexer::getStashedToken -- confirm.
    TOKEN_STASHED,
  };
  33. class FormatTokenLexer {
  34. public:
  35. FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, unsigned Column,
  36. const FormatStyle &Style, encoding::Encoding Encoding,
  37. llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
  38. IdentifierTable &IdentTable);
  39. ArrayRef<FormatToken *> lex();
  40. const AdditionalKeywords &getKeywords() { return Keywords; }
  41. private:
  42. void tryMergePreviousTokens();
  43. bool tryMergeLessLess();
  44. bool tryMergeNSStringLiteral();
  45. bool tryMergeJSPrivateIdentifier();
  46. bool tryMergeCSharpStringLiteral();
  47. bool tryMergeCSharpKeywordVariables();
  48. bool tryMergeNullishCoalescingEqual();
  49. bool tryTransformCSharpForEach();
  50. bool tryMergeForEach();
  51. bool tryTransformTryUsageForC();
  52. // Merge the most recently lexed tokens into a single token if their kinds are
  53. // correct.
  54. bool tryMergeTokens(ArrayRef<tok::TokenKind> Kinds, TokenType NewType);
  55. // Merge without checking their kinds.
  56. bool tryMergeTokens(size_t Count, TokenType NewType);
  57. // Merge if their kinds match any one of Kinds.
  58. bool tryMergeTokensAny(ArrayRef<ArrayRef<tok::TokenKind>> Kinds,
  59. TokenType NewType);
  60. // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
  61. bool precedesOperand(FormatToken *Tok);
  62. bool canPrecedeRegexLiteral(FormatToken *Prev);
  63. // Tries to parse a JavaScript Regex literal starting at the current token,
  64. // if that begins with a slash and is in a location where JavaScript allows
  65. // regex literals. Changes the current token to a regex literal and updates
  66. // its text if successful.
  67. void tryParseJSRegexLiteral();
  68. // Handles JavaScript template strings.
  69. //
  70. // JavaScript template strings use backticks ('`') as delimiters, and allow
  71. // embedding expressions nested in ${expr-here}. Template strings can be
  72. // nested recursively, i.e. expressions can contain template strings in turn.
  73. //
  74. // The code below parses starting from a backtick, up to a closing backtick or
  75. // an opening ${. It also maintains a stack of lexing contexts to handle
  76. // nested template parts by balancing curly braces.
  77. void handleTemplateStrings();
  78. void handleCSharpVerbatimAndInterpolatedStrings();
  79. void tryParsePythonComment();
  80. bool tryMerge_TMacro();
  81. bool tryMergeConflictMarkers();
  82. void truncateToken(size_t NewLen);
  83. FormatToken *getStashedToken();
  84. FormatToken *getNextToken();
  85. FormatToken *FormatTok;
  86. bool IsFirstToken;
  87. std::stack<LexerState> StateStack;
  88. unsigned Column;
  89. unsigned TrailingWhitespace;
  90. std::unique_ptr<Lexer> Lex;
  91. LangOptions LangOpts;
  92. const SourceManager &SourceMgr;
  93. FileID ID;
  94. const FormatStyle &Style;
  95. IdentifierTable &IdentTable;
  96. AdditionalKeywords Keywords;
  97. encoding::Encoding Encoding;
  98. llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator;
  99. // Index (in 'Tokens') of the last token that starts a new line.
  100. unsigned FirstInLineIndex;
  101. SmallVector<FormatToken *, 16> Tokens;
  102. llvm::SmallMapVector<IdentifierInfo *, TokenType, 8> Macros;
  103. bool FormattingDisabled;
  104. llvm::Regex MacroBlockBeginRegex;
  105. llvm::Regex MacroBlockEndRegex;
  106. // Targets that may appear inside a C# attribute.
  107. static const llvm::StringSet<> CSharpAttributeTargets;
  108. /// Handle Verilog-specific tokens.
  109. bool readRawTokenVerilogSpecific(Token &Tok);
  110. void readRawToken(FormatToken &Tok);
  111. void resetLexer(unsigned Offset);
  112. };
  113. } // namespace format
  114. } // namespace clang
  115. #endif