Macros.h 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381
  1. //===--- Macros.h - Format C++ code -----------------------------*- C++ -*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. ///
  9. /// \file
  10. /// This file contains the main building blocks of macro support in
  11. /// clang-format.
  12. ///
  13. /// In order to not violate the requirement that clang-format can format files
  14. /// in isolation, clang-format's macro support uses expansions users provide
  15. /// as part of clang-format's style configuration.
  16. ///
  17. /// Macro definitions are of the form "MACRO(p1, p2)=p1 + p2", but only support
  18. /// one level of expansion (\see MacroExpander for a full description of what
  19. /// is supported).
  20. ///
  21. /// As part of parsing, clang-format uses the MacroExpander to expand the
  22. /// spelled token streams into expanded token streams when it encounters a
  23. /// macro call. The UnwrappedLineParser continues to parse UnwrappedLines
  24. /// from the expanded token stream.
  25. /// After the expanded unwrapped lines are parsed, the MacroCallReconstructor
  26. /// matches the spelled token stream into unwrapped lines that best resemble the
  27. /// structure of the expanded unwrapped lines. These reconstructed unwrapped
  28. /// lines are aliasing the tokens in the expanded token stream, so that token
  29. /// annotations will be reused when formatting the spelled macro calls.
  30. ///
  31. /// When formatting, clang-format annotates and formats the expanded unwrapped
  32. /// lines first, determining the token types. Next, it formats the spelled
  33. /// unwrapped lines, keeping the token types fixed, while allowing other
  34. /// formatting decisions to change.
  35. ///
  36. //===----------------------------------------------------------------------===//
  37. #ifndef CLANG_LIB_FORMAT_MACROS_H
  38. #define CLANG_LIB_FORMAT_MACROS_H
  39. #include <list>
  40. #include <map>
  41. #include <string>
  42. #include <vector>
  43. #include "FormatToken.h"
  44. #include "llvm/ADT/ArrayRef.h"
  45. #include "llvm/ADT/DenseMap.h"
  46. #include "llvm/ADT/SmallVector.h"
  47. #include "llvm/ADT/StringRef.h"
  48. namespace clang {
  49. namespace format {
  50. struct UnwrappedLine;
  51. struct UnwrappedLineNode;
  52. /// Takes a set of macro definitions as strings and allows expanding calls to
  53. /// those macros.
  54. ///
  55. /// For example:
  56. /// Definition: A(x, y)=x + y
  57. /// Call : A(int a = 1, 2)
  58. /// Expansion : int a = 1 + 2
  59. ///
  60. /// Expansion does not check arity of the definition.
  61. /// If fewer arguments than expected are provided, the remaining parameters
  62. /// are considered empty:
  63. /// Call : A(a)
  64. /// Expansion: a +
  65. /// If more arguments than expected are provided, they will be discarded.
  66. ///
  67. /// The expander does not support:
  68. /// - recursive expansion
  69. /// - stringification
  70. /// - concatenation
  71. /// - variadic macros
  72. ///
  73. /// Furthermore, only a single expansion of each macro argument is supported,
  74. /// so that we cannot get conflicting formatting decisions from different
  75. /// expansions.
  76. /// Definition: A(x)=x+x
  77. /// Call : A(id)
  78. /// Expansion : id+x
  79. ///
  80. class MacroExpander {
  81. public:
  82. using ArgsList = llvm::ArrayRef<llvm::SmallVector<FormatToken *, 8>>;
  83. /// Construct a macro expander from a set of macro definitions.
  84. /// Macro definitions must be encoded as UTF-8.
  85. ///
  86. /// Each entry in \p Macros must conform to the following simple
  87. /// macro-definition language:
  88. /// <definition> ::= <id> <expansion> | <id> "(" <params> ")" <expansion>
  89. /// <params> ::= <id-list> | ""
  90. /// <id-list> ::= <id> | <id> "," <params>
  91. /// <expansion> ::= "=" <tail> | <eof>
  92. /// <tail> ::= <tok> <tail> | <eof>
  93. ///
  94. /// Macros that cannot be parsed will be silently discarded.
  95. ///
  96. MacroExpander(const std::vector<std::string> &Macros,
  97. clang::SourceManager &SourceMgr, const FormatStyle &Style,
  98. llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
  99. IdentifierTable &IdentTable);
  100. ~MacroExpander();
  101. /// Returns whether a macro \p Name is defined.
  102. bool defined(llvm::StringRef Name) const;
  103. /// Returns whether the macro has no arguments and should not consume
  104. /// subsequent parentheses.
  105. bool objectLike(llvm::StringRef Name) const;
  106. /// Returns the expanded stream of format tokens for \p ID, where
  107. /// each element in \p Args is a positional argument to the macro call.
  108. llvm::SmallVector<FormatToken *, 8> expand(FormatToken *ID,
  109. ArgsList Args) const;
  110. private:
  111. struct Definition;
  112. class DefinitionParser;
  113. void parseDefinition(const std::string &Macro);
  114. clang::SourceManager &SourceMgr;
  115. const FormatStyle &Style;
  116. llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator;
  117. IdentifierTable &IdentTable;
  118. SmallVector<std::unique_ptr<llvm::MemoryBuffer>> Buffers;
  119. llvm::StringMap<Definition> Definitions;
  120. };
  121. /// Converts a sequence of UnwrappedLines containing expanded macros into a
  122. /// single UnwrappedLine containing the macro calls. This UnwrappedLine may be
  123. /// broken into child lines, in a way that best conveys the structure of the
  124. /// expanded code.
  125. ///
  126. /// In the simplest case, a spelled UnwrappedLine contains one macro, and after
  127. /// expanding it we have one expanded UnwrappedLine. In general, macro
  128. /// expansions can span UnwrappedLines, and multiple macros can contribute
  129. /// tokens to the same line. We keep consuming expanded lines until:
  130. /// * all expansions that started have finished (we're not chopping any macros
  131. /// in half)
  132. /// * *and* we've reached the end of a *spelled* unwrapped line.
  133. ///
  134. /// A single UnwrappedLine represents this chunk of code.
  135. ///
  136. /// After this point, the state of the spelled/expanded stream is "in sync"
  137. /// (both at the start of an UnwrappedLine, with no macros open), so the
  138. /// Unexpander can be thrown away and parsing can continue.
  139. ///
  140. /// Given a mapping from the macro name identifier token in the macro call
  141. /// to the tokens of the macro call, for example:
  142. /// CLASSA -> CLASSA({public: void x();})
  143. ///
  144. /// When getting the formatted lines of the expansion via the \c addLine method
  145. /// (each '->' specifies a call to \c addLine ):
  146. /// -> class A {
  147. /// -> public:
  148. /// -> void x();
  149. /// -> };
  150. ///
  151. /// Creates the tree of unwrapped lines containing the macro call tokens so that
  152. /// the macro call tokens fit the semantic structure of the expanded formatted
  153. /// lines:
  154. /// -> CLASSA({
  155. /// -> public:
  156. /// -> void x();
  157. /// -> })
  158. class MacroCallReconstructor {
  159. public:
  160. /// Create an Reconstructor whose resulting \p UnwrappedLine will start at
  161. /// \p Level, using the map from name identifier token to the corresponding
  162. /// tokens of the spelled macro call.
  163. MacroCallReconstructor(
  164. unsigned Level,
  165. const llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>>
  166. &ActiveExpansions);
  167. /// For the given \p Line, match all occurences of tokens expanded from a
  168. /// macro to unwrapped lines in the spelled macro call so that the resulting
  169. /// tree of unwrapped lines best resembles the structure of unwrapped lines
  170. /// passed in via \c addLine.
  171. void addLine(const UnwrappedLine &Line);
  172. /// Check whether at the current state there is no open macro expansion
  173. /// that needs to be processed to finish an macro call.
  174. /// Only when \c finished() is true, \c takeResult() can be called to retrieve
  175. /// the resulting \c UnwrappedLine.
  176. /// If there are multiple subsequent macro calls within an unwrapped line in
  177. /// the spelled token stream, the calling code may also continue to call
  178. /// \c addLine() when \c finished() is true.
  179. bool finished() const { return ActiveExpansions.empty(); }
  180. /// Retrieve the formatted \c UnwrappedLine containing the orginal
  181. /// macro calls, formatted according to the expanded token stream received
  182. /// via \c addLine().
  183. /// Generally, this line tries to have the same structure as the expanded,
  184. /// formatted unwrapped lines handed in via \c addLine(), with the exception
  185. /// that for multiple top-level lines, each subsequent line will be the
  186. /// child of the last token in its predecessor. This representation is chosen
  187. /// because it is a precondition to the formatter that we get what looks like
  188. /// a single statement in a single \c UnwrappedLine (i.e. matching parens).
  189. ///
  190. /// If a token in a macro argument is a child of a token in the expansion,
  191. /// the parent will be the corresponding token in the macro call.
  192. /// For example:
  193. /// #define C(a, b) class C { a b
  194. /// C(int x;, int y;)
  195. /// would expand to
  196. /// class C { int x; int y;
  197. /// where in a formatted line "int x;" and "int y;" would both be new separate
  198. /// lines.
  199. ///
  200. /// In the result, "int x;" will be a child of the opening parenthesis in "C("
  201. /// and "int y;" will be a child of the "," token:
  202. /// C (
  203. /// \- int x;
  204. /// ,
  205. /// \- int y;
  206. /// )
  207. UnwrappedLine takeResult() &&;
  208. private:
  209. void add(FormatToken *Token, FormatToken *ExpandedParent, bool First);
  210. void prepareParent(FormatToken *ExpandedParent, bool First);
  211. FormatToken *getParentInResult(FormatToken *Parent);
  212. void reconstruct(FormatToken *Token);
  213. void startReconstruction(FormatToken *Token);
  214. bool reconstructActiveCallUntil(FormatToken *Token);
  215. void endReconstruction(FormatToken *Token);
  216. bool processNextReconstructed();
  217. void finalize();
  218. struct ReconstructedLine;
  219. void appendToken(FormatToken *Token, ReconstructedLine *L = nullptr);
  220. UnwrappedLine createUnwrappedLine(const ReconstructedLine &Line, int Level);
  221. void debug(const ReconstructedLine &Line, int Level);
  222. ReconstructedLine &parentLine();
  223. ReconstructedLine *currentLine();
  224. void debugParentMap() const;
  225. #ifndef NDEBUG
  226. enum ReconstructorState {
  227. Start, // No macro expansion was found in the input yet.
  228. InProgress, // During a macro reconstruction.
  229. Finalized, // Past macro reconstruction, the result is finalized.
  230. };
  231. ReconstructorState State = Start;
  232. #endif
  233. // Node in which we build up the resulting unwrapped line; this type is
  234. // analogous to UnwrappedLineNode.
  235. struct LineNode {
  236. LineNode() = default;
  237. LineNode(FormatToken *Tok) : Tok(Tok) {}
  238. FormatToken *Tok = nullptr;
  239. llvm::SmallVector<std::unique_ptr<ReconstructedLine>> Children;
  240. };
  241. // Line in which we build up the resulting unwrapped line.
  242. // FIXME: Investigate changing UnwrappedLine to a pointer type and using it
  243. // instead of rolling our own type.
  244. struct ReconstructedLine {
  245. llvm::SmallVector<std::unique_ptr<LineNode>> Tokens;
  246. };
  247. // The line in which we collect the resulting reconstructed output.
  248. // To reduce special cases in the algorithm, the first level of the line
  249. // contains a single null token that has the reconstructed incoming
  250. // lines as children.
  251. // In the end, we stich the lines together so that each subsequent line
  252. // is a child of the last token of the previous line. This is necessary
  253. // in order to format the overall expression as a single logical line -
  254. // if we created separate lines, we'd format them with their own top-level
  255. // indent depending on the semantic structure, which is not desired.
  256. ReconstructedLine Result;
  257. // Stack of currently "open" lines, where each line's predecessor's last
  258. // token is the parent token for that line.
  259. llvm::SmallVector<ReconstructedLine *> ActiveReconstructedLines;
  260. // Maps from the expanded token to the token that takes its place in the
  261. // reconstructed token stream in terms of parent-child relationships.
  262. // Note that it might take multiple steps to arrive at the correct
  263. // parent in the output.
  264. // Given: #define C(a, b) []() { a; b; }
  265. // And a call: C(f(), g())
  266. // The structure in the incoming formatted unwrapped line will be:
  267. // []() {
  268. // |- f();
  269. // \- g();
  270. // }
  271. // with f and g being children of the opening brace.
  272. // In the reconstructed call:
  273. // C(f(), g())
  274. // \- f()
  275. // \- g()
  276. // We want f to be a child of the opening parenthesis and g to be a child
  277. // of the comma token in the macro call.
  278. // Thus, we map
  279. // { -> (
  280. // and add
  281. // ( -> ,
  282. // once we're past the comma in the reconstruction.
  283. llvm::DenseMap<FormatToken *, FormatToken *>
  284. SpelledParentToReconstructedParent;
  285. // Keeps track of a single expansion while we're reconstructing tokens it
  286. // generated.
  287. struct Expansion {
  288. // The identifier token of the macro call.
  289. FormatToken *ID;
  290. // Our current position in the reconstruction.
  291. std::list<UnwrappedLineNode>::iterator SpelledI;
  292. // The end of the reconstructed token sequence.
  293. std::list<UnwrappedLineNode>::iterator SpelledE;
  294. };
  295. // Stack of macro calls for which we're in the middle of an expansion.
  296. llvm::SmallVector<Expansion> ActiveExpansions;
  297. struct MacroCallState {
  298. MacroCallState(ReconstructedLine *Line, FormatToken *ParentLastToken,
  299. FormatToken *MacroCallLParen);
  300. ReconstructedLine *Line;
  301. // The last token in the parent line or expansion, or nullptr if the macro
  302. // expansion is on a top-level line.
  303. //
  304. // For example, in the macro call:
  305. // auto f = []() { ID(1); };
  306. // The MacroCallState for ID will have '{' as ParentLastToken.
  307. //
  308. // In the macro call:
  309. // ID(ID(void f()));
  310. // The MacroCallState of the outer ID will have nullptr as ParentLastToken,
  311. // while the MacroCallState for the inner ID will have the '(' of the outer
  312. // ID as ParentLastToken.
  313. //
  314. // In the macro call:
  315. // ID2(a, ID(b));
  316. // The MacroCallState of ID will have ',' as ParentLastToken.
  317. FormatToken *ParentLastToken;
  318. // The l_paren of this MacroCallState's macro call.
  319. FormatToken *MacroCallLParen;
  320. };
  321. // Keeps track of the lines into which the opening brace/parenthesis &
  322. // argument separating commas for each level in the macro call go in order to
  323. // put the corresponding closing brace/parenthesis into the same line in the
  324. // output and keep track of which parents in the expanded token stream map to
  325. // which tokens in the reconstructed stream.
  326. // When an opening brace/parenthesis has children, we want the structure of
  327. // the output line to be:
  328. // |- MACRO
  329. // |- (
  330. // | \- <argument>
  331. // |- ,
  332. // | \- <argument>
  333. // \- )
  334. llvm::SmallVector<MacroCallState> MacroCallStructure;
  335. // Level the generated UnwrappedLine will be at.
  336. const unsigned Level;
  337. // Maps from identifier of the macro call to an unwrapped line containing
  338. // all tokens of the macro call.
  339. const llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>>
  340. &IdToReconstructed;
  341. };
  342. } // namespace format
  343. } // namespace clang
  344. #endif