aboutsummaryrefslogtreecommitdiff
path: root/clang/lib/Format/Macros.h
blob: 1964624e828ce3c2693153f248750e5c669ee096 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
//===--- Macros.h - Format C++ code -----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains the main building blocks of macro support in
/// clang-format.
///
/// In order to not violate the requirement that clang-format can format files
/// in isolation, clang-format's macro support uses expansions users provide
/// as part of clang-format's style configuration.
///
/// Macro definitions are of the form "MACRO(p1, p2)=p1 + p2", but only support
/// one level of expansion (\see MacroExpander for a full description of what
/// is supported).
///
/// As part of parsing, clang-format uses the MacroExpander to expand the
/// spelled token streams into expanded token streams when it encounters a
/// macro call. The UnwrappedLineParser continues to parse UnwrappedLines
/// from the expanded token stream.
/// After the expanded unwrapped lines are parsed, the MacroCallReconstructor
/// matches the spelled token stream into unwrapped lines that best resemble the
/// structure of the expanded unwrapped lines. These reconstructed unwrapped
/// lines are aliasing the tokens in the expanded token stream, so that token
/// annotations will be reused when formatting the spelled macro calls.
///
/// When formatting, clang-format annotates and formats the expanded unwrapped
/// lines first, determining the token types. Next, it formats the spelled
/// unwrapped lines, keeping the token types fixed, while allowing other
/// formatting decisions to change.
///
//===----------------------------------------------------------------------===//

#ifndef CLANG_LIB_FORMAT_MACROS_H
#define CLANG_LIB_FORMAT_MACROS_H

#include <list>
#include <map>
#include <string>
#include <vector>

#include "FormatToken.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"

namespace clang {
namespace format {

struct UnwrappedLine;
struct UnwrappedLineNode;

/// Takes a set of macro definitions as strings and allows expanding calls to
/// those macros.
///
/// For example:
/// Definition: A(x, y)=x + y
/// Call      : A(int a = 1, 2)
/// Expansion : int a = 1 + 2
///
/// Expansion does not check arity of the definition.
/// If fewer arguments than expected are provided, the remaining parameters
/// are considered empty:
/// Call     : A(a)
/// Expansion: a +
/// If more arguments than expected are provided, they will be discarded.
///
/// The expander does not support:
/// - recursive expansion
/// - stringification
/// - concatenation
/// - variadic macros
///
/// Furthermore, only a single expansion of each macro argument is supported,
/// so that we cannot get conflicting formatting decisions from different
/// expansions.
/// Definition: A(x)=x+x
/// Call      : A(id)
/// Expansion : id+x
///
class MacroExpander {
public:
  using ArgsList = llvm::ArrayRef<llvm::SmallVector<FormatToken *, 8>>;

  /// Construct a macro expander from a set of macro definitions.
  /// Macro definitions must be encoded as UTF-8.
  ///
  /// Each entry in \p Macros must conform to the following simple
  /// macro-definition language:
  /// <definition> ::= <id> <expansion> | <id> "(" <params> ")" <expansion>
  /// <params>     ::= <id-list> | ""
  /// <id-list>    ::= <id> | <id> "," <params>
  /// <expansion>  ::= "=" <tail> | <eof>
  /// <tail>       ::= <tok> <tail> | <eof>
  ///
  /// Macros that cannot be parsed will be silently discarded.
  ///
  MacroExpander(const std::vector<std::string> &Macros,
                clang::SourceManager &SourceMgr, const FormatStyle &Style,
                llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
                IdentifierTable &IdentTable);
  ~MacroExpander();

  /// Returns whether any macro \p Name is defined, regardless of overloads.
  bool defined(llvm::StringRef Name) const;

  /// Returns whetherh there is an object-like overload, i.e. where the macro
  /// has no arguments and should not consume subsequent parentheses.
  bool objectLike(llvm::StringRef Name) const;

  /// Returns whether macro \p Name provides an overload with the given arity.
  bool hasArity(llvm::StringRef Name, unsigned Arity) const;

  /// Returns the expanded stream of format tokens for \p ID, where
  /// each element in \p Args is a positional argument to the macro call.
  /// If \p Args is not set, the object-like overload is used.
  /// If \p Args is set, the overload with the arity equal to \c Args.size() is
  /// used.
  llvm::SmallVector<FormatToken *, 8>
  expand(FormatToken *ID, std::optional<ArgsList> OptionalArgs) const;

private:
  struct Definition;
  class DefinitionParser;

  void parseDefinition(const std::string &Macro);

  clang::SourceManager &SourceMgr;
  const FormatStyle &Style;
  llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator;
  IdentifierTable &IdentTable;
  SmallVector<std::unique_ptr<llvm::MemoryBuffer>> Buffers;
  llvm::StringMap<llvm::DenseMap<int, Definition>> FunctionLike;
  llvm::StringMap<Definition> ObjectLike;
};

/// Converts a sequence of UnwrappedLines containing expanded macros into a
/// single UnwrappedLine containing the macro calls.  This UnwrappedLine may be
/// broken into child lines, in a way that best conveys the structure of the
/// expanded code.
///
/// In the simplest case, a spelled UnwrappedLine contains one macro, and after
/// expanding it we have one expanded UnwrappedLine.  In general, macro
/// expansions can span UnwrappedLines, and multiple macros can contribute
/// tokens to the same line.  We keep consuming expanded lines until:
/// *   all expansions that started have finished (we're not chopping any macros
///     in half)
/// *   *and* we've reached the end of a *spelled* unwrapped line.
///
/// A single UnwrappedLine represents this chunk of code.
///
/// After this point, the state of the spelled/expanded stream is "in sync"
/// (both at the start of an UnwrappedLine, with no macros open), so the
/// Reconstructor can be thrown away and parsing can continue.
///
/// Given a mapping from the macro name identifier token in the macro call
/// to the tokens of the macro call, for example:
/// CLASSA -> CLASSA({public: void x();})
///
/// When getting the formatted lines of the expansion via the \c addLine method
/// (each '->' specifies a call to \c addLine ):
/// -> class A {
/// -> public:
/// ->   void x();
/// -> };
///
/// Creates the tree of unwrapped lines containing the macro call tokens so that
/// the macro call tokens fit the semantic structure of the expanded formatted
/// lines:
/// -> CLASSA({
/// -> public:
/// ->   void x();
/// -> })
class MacroCallReconstructor {
public:
  /// Create an Reconstructor whose resulting \p UnwrappedLine will start at
  /// \p Level, using the map from name identifier token to the corresponding
  /// tokens of the spelled macro call.
  MacroCallReconstructor(
      unsigned Level,
      const llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>>
          &ActiveExpansions);

  /// For the given \p Line, match all occurences of tokens expanded from a
  /// macro to unwrapped lines in the spelled macro call so that the resulting
  /// tree of unwrapped lines best resembles the structure of unwrapped lines
  /// passed in via \c addLine.
  void addLine(const UnwrappedLine &Line);

  /// Check whether at the current state there is no open macro expansion
  /// that needs to be processed to finish an macro call.
  /// Only when \c finished() is true, \c takeResult() can be called to retrieve
  /// the resulting \c UnwrappedLine.
  /// If there are multiple subsequent macro calls within an unwrapped line in
  /// the spelled token stream, the calling code may also continue to call
  /// \c addLine() when \c finished() is true.
  bool finished() const { return ActiveExpansions.empty(); }

  /// Retrieve the formatted \c UnwrappedLine containing the orginal
  /// macro calls, formatted according to the expanded token stream received
  /// via \c addLine().
  /// Generally, this line tries to have the same structure as the expanded,
  /// formatted unwrapped lines handed in via \c addLine(), with the exception
  /// that for multiple top-level lines, each subsequent line will be the
  /// child of the last token in its predecessor. This representation is chosen
  /// because it is a precondition to the formatter that we get what looks like
  /// a single statement in a single \c UnwrappedLine (i.e. matching parens).
  ///
  /// If a token in a macro argument is a child of a token in the expansion,
  /// the parent will be the corresponding token in the macro call.
  /// For example:
  ///   #define C(a, b) class C { a b
  ///   C(int x;, int y;)
  /// would expand to
  ///   class C { int x; int y;
  /// where in a formatted line "int x;" and "int y;" would both be new separate
  /// lines.
  ///
  /// In the result, "int x;" will be a child of the opening parenthesis in "C("
  /// and "int y;" will be a child of the "," token:
  ///   C (
  ///     \- int x;
  ///     ,
  ///     \- int y;
  ///     )
  UnwrappedLine takeResult() &&;

private:
  void add(FormatToken *Token, FormatToken *ExpandedParent, bool First);
  void prepareParent(FormatToken *ExpandedParent, bool First);
  FormatToken *getParentInResult(FormatToken *Parent);
  void reconstruct(FormatToken *Token);
  void startReconstruction(FormatToken *Token);
  bool reconstructActiveCallUntil(FormatToken *Token);
  void endReconstruction(FormatToken *Token);
  bool processNextReconstructed();
  void finalize();

  struct ReconstructedLine;

  void appendToken(FormatToken *Token, ReconstructedLine *L = nullptr);
  UnwrappedLine createUnwrappedLine(const ReconstructedLine &Line, int Level);
  void debug(const ReconstructedLine &Line, int Level);
  ReconstructedLine &parentLine();
  ReconstructedLine *currentLine();
  void debugParentMap() const;

#ifndef NDEBUG
  enum ReconstructorState {
    Start,      // No macro expansion was found in the input yet.
    InProgress, // During a macro reconstruction.
    Finalized,  // Past macro reconstruction, the result is finalized.
  };
  ReconstructorState State = Start;
#endif

  // Node in which we build up the resulting unwrapped line; this type is
  // analogous to UnwrappedLineNode.
  struct LineNode {
    LineNode() = default;
    LineNode(FormatToken *Tok) : Tok(Tok) {}
    FormatToken *Tok = nullptr;
    llvm::SmallVector<std::unique_ptr<ReconstructedLine>> Children;
  };

  // Line in which we build up the resulting unwrapped line.
  // FIXME: Investigate changing UnwrappedLine to a pointer type and using it
  // instead of rolling our own type.
  struct ReconstructedLine {
    llvm::SmallVector<std::unique_ptr<LineNode>> Tokens;
  };

  // The line in which we collect the resulting reconstructed output.
  // To reduce special cases in the algorithm, the first level of the line
  // contains a single null token that has the reconstructed incoming
  // lines as children.
  // In the end, we stich the lines together so that each subsequent line
  // is a child of the last token of the previous line. This is necessary
  // in order to format the overall expression as a single logical line -
  // if we created separate lines, we'd format them with their own top-level
  // indent depending on the semantic structure, which is not desired.
  ReconstructedLine Result;

  // Stack of currently "open" lines, where each line's predecessor's last
  // token is the parent token for that line.
  llvm::SmallVector<ReconstructedLine *> ActiveReconstructedLines;

  // Maps from the expanded token to the token that takes its place in the
  // reconstructed token stream in terms of parent-child relationships.
  // Note that it might take multiple steps to arrive at the correct
  // parent in the output.
  // Given: #define C(a, b) []() { a; b; }
  // And a call: C(f(), g())
  // The structure in the incoming formatted unwrapped line will be:
  // []() {
  //      |- f();
  //      \- g();
  // }
  // with f and g being children of the opening brace.
  // In the reconstructed call:
  // C(f(), g())
  //  \- f()
  //      \- g()
  // We want f to be a child of the opening parenthesis and g to be a child
  // of the comma token in the macro call.
  // Thus, we map
  // { -> (
  // and add
  // ( -> ,
  // once we're past the comma in the reconstruction.
  llvm::DenseMap<FormatToken *, FormatToken *>
      SpelledParentToReconstructedParent;

  // Keeps track of a single expansion while we're reconstructing tokens it
  // generated.
  struct Expansion {
    // The identifier token of the macro call.
    FormatToken *ID;
    // Our current position in the reconstruction.
    std::list<UnwrappedLineNode>::iterator SpelledI;
    // The end of the reconstructed token sequence.
    std::list<UnwrappedLineNode>::iterator SpelledE;
  };

  // Stack of macro calls for which we're in the middle of an expansion.
  llvm::SmallVector<Expansion> ActiveExpansions;

  struct MacroCallState {
    MacroCallState(ReconstructedLine *Line, FormatToken *ParentLastToken,
                   FormatToken *MacroCallLParen);

    ReconstructedLine *Line;

    // The last token in the parent line or expansion, or nullptr if the macro
    // expansion is on a top-level line.
    //
    // For example, in the macro call:
    //   auto f = []() { ID(1); };
    // The MacroCallState for ID will have '{' as ParentLastToken.
    //
    // In the macro call:
    //   ID(ID(void f()));
    // The MacroCallState of the outer ID will have nullptr as ParentLastToken,
    // while the MacroCallState for the inner ID will have the '(' of the outer
    // ID as ParentLastToken.
    //
    // In the macro call:
    //   ID2(a, ID(b));
    // The MacroCallState of ID will have ',' as ParentLastToken.
    FormatToken *ParentLastToken;

    // The l_paren of this MacroCallState's macro call.
    FormatToken *MacroCallLParen;
  };

  // Keeps track of the lines into which the opening brace/parenthesis &
  // argument separating commas for each level in the macro call go in order to
  // put the corresponding closing brace/parenthesis into the same line in the
  // output and keep track of which parents in the expanded token stream map to
  // which tokens in the reconstructed stream.
  // When an opening brace/parenthesis has children, we want the structure of
  // the output line to be:
  // |- MACRO
  // |- (
  // |  \- <argument>
  // |- ,
  // |  \- <argument>
  // \- )
  llvm::SmallVector<MacroCallState> MacroCallStructure;

  // Level the generated UnwrappedLine will be at.
  const unsigned Level;

  // Maps from identifier of the macro call to an unwrapped line containing
  // all tokens of the macro call.
  const llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>>
      &IdToReconstructed;
};

} // namespace format
} // namespace clang

#endif