aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
blob: afdf1655b462231449b0e8663f25c52132b96cec (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
//===- VPlanTransforms.h - Utility VPlan to VPlan transforms --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file provides utility VPlan to VPlan transformations.
//===----------------------------------------------------------------------===//

#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
#define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H

#include "VPlan.h"
#include "VPlanVerifier.h"
#include "llvm/ADT/STLFunctionalExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"

namespace llvm {

class InductionDescriptor;
class Instruction;
class LoopVersioning;
class PHINode;
class ScalarEvolution;
class PredicatedScalarEvolution;
class TargetLibraryInfo;
class VPBuilder;
class VPRecipeBuilder;
struct VFRange;

LLVM_ABI_FOR_TEST extern cl::opt<bool> VerifyEachVPlan;
LLVM_ABI_FOR_TEST extern cl::opt<bool> EnableWideActiveLaneMask;

struct VPlanTransforms {
  /// Helper to run a VPlan transform \p Transform on \p VPlan, forwarding extra
  /// arguments to the transform. Returns the boolean returned by the transform.
  template <typename... ArgsTy>
  static bool runPass(bool (*Transform)(VPlan &, ArgsTy...), VPlan &Plan,
                      std::remove_reference_t<ArgsTy> &...Args) {
    bool Res = Transform(Plan, Args...);
    if (VerifyEachVPlan)
      verifyVPlanIsValid(Plan);
    return Res;
  }
  /// Helper to run a VPlan transform \p Transform on \p VPlan, forwarding extra
  /// arguments to the transform.
  template <typename... ArgsTy>
  static void runPass(void (*Fn)(VPlan &, ArgsTy...), VPlan &Plan,
                      std::remove_reference_t<ArgsTy> &...Args) {
    Fn(Plan, Args...);
    if (VerifyEachVPlan)
      verifyVPlanIsValid(Plan);
  }

  /// Create a base VPlan0, serving as the common starting point for all later
  /// candidates. It consists of an initial plain CFG loop with loop blocks from
  /// \p TheLoop being directly translated to VPBasicBlocks with VPInstruction
  /// corresponding to the input IR.
  ///
  /// The created loop is wrapped in an initial skeleton to facilitate
  /// vectorization, consisting of a vector pre-header, an exit block for the
  /// main vector loop (middle.block) and a new block as preheader of the scalar
  /// loop (scalar.ph). See below for an illustration. It also adds a canonical
  /// IV and its increment, using \p InductionTy and \p IVDL, and creates a
  /// VPValue expression for the original trip count.
  ///
  ///    [ ] <-- Plan's entry VPIRBasicBlock, wrapping the original loop's
  ///    / \       old preheader. Will contain iteration number check and SCEV
  ///   |   |      expansions.
  ///   |   |
  ///   /   v
  ///  |   [ ] <-- vector loop bypass (may consist of multiple blocks) will be
  ///  |  / |      added later.
  ///  | /  v
  ///  ||  [ ]     <-- vector pre header.
  ///  |/   |
  ///  |    v
  ///  |   [  ] \  <-- plain CFG loop wrapping original loop to be vectorized.
  ///  |   [  ]_|
  ///  |    |
  ///  |    v
  ///  |   [ ]   <--- middle-block with the branch to successors
  ///  |   / |
  ///  |  /  |
  ///  | |   v
  ///  \--->[ ]   <--- scalar preheader (initially a VPBasicBlock, which will
  ///    |   |         be replaced later by a VPIRBasicBlock wrapping the
  ///    |   |         scalar preheader basic block).
  ///    |   |
  ///        v      <-- edge from middle to exit iff epilogue is not required.
  ///    |  [ ] \
  ///    |  [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue,
  ///    |   |          header wrapped in VPIRBasicBlock).
  ///    \   |
  ///     \  v
  ///      >[ ]     <-- original loop exit block(s), wrapped in VPIRBasicBlocks.
  LLVM_ABI_FOR_TEST static std::unique_ptr<VPlan>
  buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL,
              PredicatedScalarEvolution &PSE, LoopVersioning *LVer = nullptr);

  /// Update \p Plan to account for all early exits.
  LLVM_ABI_FOR_TEST static void handleEarlyExits(VPlan &Plan,
                                                 bool HasUncountableExit);

  /// If a check is needed to guard executing the scalar epilogue loop, it will
  /// be added to the middle block.
  LLVM_ABI_FOR_TEST static void addMiddleCheck(VPlan &Plan,
                                               bool RequiresScalarEpilogueCheck,
                                               bool TailFolded);

  /// Add a check to \p Plan to see if the vector loop should be executed.
  static void addMinimumIterationCheck(
      VPlan &Plan, ElementCount VF, unsigned UF,
      ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue,
      bool TailFolded, bool CheckNeededWithTailFolding, Loop *OrigLoop,
      const uint32_t *MinItersBypassWeights, DebugLoc DL, ScalarEvolution &SE);

  /// Add a check to \p Plan to see if the epilogue vector loop should be
  /// executed.
  static void addMinimumVectorEpilogueIterationCheck(
      VPlan &Plan, Value *TripCount, Value *VectorTripCount,
      bool RequiresScalarEpilogue, ElementCount EpilogueVF, unsigned EpilogueUF,
      unsigned MainLoopStep, unsigned EpilogueLoopStep, ScalarEvolution &SE);

  /// Replace loops in \p Plan's flat CFG with VPRegionBlocks, turning \p Plan's
  /// flat CFG into a hierarchical CFG.
  LLVM_ABI_FOR_TEST static void createLoopRegions(VPlan &Plan);

  /// Wrap runtime check block \p CheckBlock in a VPIRBB and \p Cond in a
  /// VPValue and connect the block to \p Plan, using the VPValue as branch
  /// condition.
  static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock,
                               bool AddBranchWeights);

  /// Replaces the VPInstructions in \p Plan with corresponding
  /// widen recipes. Returns false if any VPInstructions could not be converted
  /// to a wide recipe if needed.
  LLVM_ABI_FOR_TEST static bool tryToConvertVPInstructionsToVPRecipes(
      VPlan &Plan,
      function_ref<const InductionDescriptor *(PHINode *)>
          GetIntOrFpInductionDescriptor,
      const TargetLibraryInfo &TLI);

  /// Try to legalize reductions with multiple in-loop uses. Currently only
  /// min/max reductions used by FindLastIV reductions are supported. Otherwise
  /// return false.
  static bool handleMultiUseReductions(VPlan &Plan);

  /// Try to have all users of fixed-order recurrences appear after the recipe
  /// defining their previous value, by either sinking users or hoisting recipes
  /// defining their previous value (and its operands). Then introduce
  /// FirstOrderRecurrenceSplice VPInstructions to combine the value from the
  /// recurrence phis and previous values.
  /// \returns true if all users of fixed-order recurrences could be re-arranged
  /// as needed or false if it is not possible. In the latter case, \p Plan is
  /// not valid.
  static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder);

  /// Check if \p Plan contains any FMaxNum or FMinNum reductions. If they do,
  /// try to update the vector loop to exit early if any input is NaN and resume
  /// executing in the scalar loop to handle the NaNs there. Return false if
  /// this attempt was unsuccessful.
  static bool handleMaxMinNumReductions(VPlan &Plan);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  static void clearReductionWrapFlags(VPlan &Plan);

  /// Explicitly unroll \p Plan by \p UF.
  static void unrollByUF(VPlan &Plan, unsigned UF);

  /// Replace each replicating VPReplicateRecipe and VPInstruction outside of
  /// any replicate region in \p Plan with \p VF single-scalar recipes.
  /// TODO: Also replicate VPScalarIVSteps and VPReplicateRecipes inside
  /// replicate regions, thereby dissolving the latter.
  static void replicateByVF(VPlan &Plan, ElementCount VF);

  /// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the
  /// resulting plan to \p BestVF and \p BestUF.
  static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                 unsigned BestUF,
                                 PredicatedScalarEvolution &PSE);

  /// Apply VPlan-to-VPlan optimizations to \p Plan, including induction recipe
  /// optimizations, dead recipe removal, replicate region optimizations and
  /// block merging.
  LLVM_ABI_FOR_TEST static void optimize(VPlan &Plan);

  /// Wrap predicated VPReplicateRecipes with a mask operand in an if-then
  /// region block and remove the mask operand. Optimize the created regions by
  /// iteratively sinking scalar operands into the region, followed by merging
  /// regions until no improvements are remaining.
  static void createAndOptimizeReplicateRegions(VPlan &Plan);

  /// Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an
  /// (active-lane-mask recipe, wide canonical IV, trip-count). If \p
  /// UseActiveLaneMaskForControlFlow is true, introduce an
  /// VPActiveLaneMaskPHIRecipe. If \p DataAndControlFlowWithoutRuntimeCheck is
  /// true, no minimum-iteration runtime check will be created (during skeleton
  /// creation) and instead it is handled using active-lane-mask. \p
  /// DataAndControlFlowWithoutRuntimeCheck implies \p
  /// UseActiveLaneMaskForControlFlow.
  static void addActiveLaneMask(VPlan &Plan,
                                bool UseActiveLaneMaskForControlFlow,
                                bool DataAndControlFlowWithoutRuntimeCheck);

  /// Insert truncates and extends for any truncated recipe. Redundant casts
  /// will be folded later.
  static void
  truncateToMinimalBitwidths(VPlan &Plan,
                             const MapVector<Instruction *, uint64_t> &MinBWs);

  /// Replace symbolic strides from \p StridesMap in \p Plan with constants when
  /// possible.
  static void
  replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE,
                         const DenseMap<Value *, const SCEV *> &StridesMap);

  /// Drop poison flags from recipes that may generate a poison value that is
  /// used after vectorization, even when their operands are not poison. Those
  /// recipes meet the following conditions:
  ///  * Contribute to the address computation of a recipe generating a widen
  ///    memory load/store (VPWidenMemoryInstructionRecipe or
  ///    VPInterleaveRecipe).
  ///  * Such a widen memory load/store has at least one underlying Instruction
  ///    that is in a basic block that needs predication and after vectorization
  ///    the generated instruction won't be predicated.
  /// Uses \p BlockNeedsPredication to check if a block needs predicating.
  /// TODO: Replace BlockNeedsPredication callback with retrieving info from
  ///       VPlan directly.
  static void dropPoisonGeneratingRecipes(
      VPlan &Plan,
      const std::function<bool(BasicBlock *)> &BlockNeedsPredication);

  /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
  /// replaces all uses except the canonical IV increment of
  /// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe.
  /// VPCanonicalIVPHIRecipe is only used to control the loop after
  /// this transformation.
  static void
  addExplicitVectorLength(VPlan &Plan,
                          const std::optional<unsigned> &MaxEVLSafeElements);

  /// For each Interleave Group in \p InterleaveGroups replace the Recipes
  /// widening its memory instructions with a single VPInterleaveRecipe at its
  /// insertion point.
  static void createInterleaveGroups(
      VPlan &Plan,
      const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
          &InterleaveGroups,
      VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed);

  /// Remove dead recipes from \p Plan.
  static void removeDeadRecipes(VPlan &Plan);

  /// Update \p Plan to account for the uncountable early exit from \p
  /// EarlyExitingVPBB to \p EarlyExitVPBB by
  ///  * updating the condition exiting the loop via the latch to include the
  ///    early exit condition,
  ///  * splitting the original middle block to branch to the early exit block
  ///    conditionally - according to the early exit condition.
  static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
                                         VPBasicBlock *EarlyExitVPBB,
                                         VPlan &Plan, VPBasicBlock *HeaderVPBB,
                                         VPBasicBlock *LatchVPBB);

  /// Replace loop regions with explicit CFG.
  static void dissolveLoopRegions(VPlan &Plan);

  /// Transform EVL loops to use variable-length stepping after region
  /// dissolution.
  ///
  /// Once loop regions are replaced with explicit CFG, EVL loops can step with
  /// variable vector lengths instead of fixed lengths. This transformation:
  ///  * Makes EVL-Phi concrete.
  ///  * Removes CanonicalIV and increment.
  ///  * Replaces the exit condition from
  ///      (branch-on-count CanonicalIVInc, VectorTripCount)
  ///    to
  ///      (branch-on-cond eq AVLNext, 0)
  static void canonicalizeEVLLoops(VPlan &Plan);

  /// Lower abstract recipes to concrete ones, that can be codegen'd.
  static void convertToConcreteRecipes(VPlan &Plan);

  /// This function converts initial recipes to the abstract recipes and clamps
  /// \p Range based on cost model for following optimizations and cost
  /// estimations. The converted abstract recipes will lower to concrete
  /// recipes before codegen.
  static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
                                       VFRange &Range);

  /// Perform instcombine-like simplifications on recipes in \p Plan.
  static void simplifyRecipes(VPlan &Plan);

  /// Remove BranchOnCond recipes with true or false conditions together with
  /// removing dead edges to their successors.
  static void removeBranchOnConst(VPlan &Plan);

  /// Perform common-subexpression-elimination on \p Plan.
  static void cse(VPlan &Plan);

  /// If there's a single exit block, optimize its phi recipes that use exiting
  /// IV values by feeding them precomputed end values instead, possibly taken
  /// one step backwards.
  static void
  optimizeInductionExitUsers(VPlan &Plan,
                             DenseMap<VPValue *, VPValue *> &EndValues,
                             ScalarEvolution &SE);

  /// Add explicit broadcasts for live-ins and VPValues defined in \p Plan's
  /// entry block if they are used as vectors.
  static void materializeBroadcasts(VPlan &Plan);

  /// Hoist single-scalar loads with invariant addresses out of the vector loop
  /// to the preheader, if they are proven not to alias with any stores in the
  /// plan using noalias metadata.
  static void hoistInvariantLoads(VPlan &Plan);

  /// Hoist predicated loads from the same address to the loop entry block, if
  /// they are guaranteed to execute on both paths (i.e., in replicate regions
  /// with complementary masks P and NOT P).
  static void hoistPredicatedLoads(VPlan &Plan, ScalarEvolution &SE,
                                   const Loop *L);

  /// Sink predicated stores to the same address with complementary predicates
  /// (P and NOT P) to an unconditional store with select recipes for the
  /// stored values. This eliminates branching overhead when all paths
  /// unconditionally store to the same location.
  static void sinkPredicatedStores(VPlan &Plan, ScalarEvolution &SE,
                                   const Loop *L);

  /// Materialize vector trip counts for constants early if it can simply be
  /// computed as (Original TC / VF * UF) * VF * UF.
  static void
  materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF,
                                     unsigned BestUF,
                                     PredicatedScalarEvolution &PSE);

  /// Materialize vector trip count computations to a set of VPInstructions.
  static void materializeVectorTripCount(VPlan &Plan,
                                         VPBasicBlock *VectorPHVPBB,
                                         bool TailByMasking,
                                         bool RequiresScalarEpilogue);

  /// Materialize the backedge-taken count to be computed explicitly using
  /// VPInstructions.
  static void materializeBackedgeTakenCount(VPlan &Plan,
                                            VPBasicBlock *VectorPH);

  /// Add explicit Build[Struct]Vector recipes to Pack multiple scalar values
  /// into vectors and Unpack recipes to extract scalars from vectors as
  /// needed.
  static void materializePacksAndUnpacks(VPlan &Plan);

  /// Materialize VF and VFxUF to be computed explicitly using VPInstructions.
  static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
                                    ElementCount VF);

  /// Expand VPExpandSCEVRecipes in \p Plan's entry block. Each
  /// VPExpandSCEVRecipe is replaced with a live-in wrapping the expanded IR
  /// value. A mapping from SCEV expressions to their expanded IR value is
  /// returned.
  static DenseMap<const SCEV *, Value *> expandSCEVs(VPlan &Plan,
                                                     ScalarEvolution &SE);

  /// Try to convert a plan with interleave groups with VF elements to a plan
  /// with the interleave groups replaced by wide loads and stores processing VF
  /// elements, if all transformed interleave groups access the full vector
  /// width (checked via \p VectorRegWidth). This effectively is a very simple
  /// form of loop-aware SLP, where we use interleave groups to identify
  /// candidates.
  static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
                                     TypeSize VectorRegWidth);

  /// Predicate and linearize the control-flow in the only loop region of
  /// \p Plan. If \p FoldTail is true, create a mask guarding the loop
  /// header, otherwise use all-true for the header mask. Masks for blocks are
  /// added to a block-to-mask map which is returned in order to be used later
  /// for wide recipe construction. This argument is temporary and will be
  /// removed in the future.
  static DenseMap<VPBasicBlock *, VPValue *>
  introduceMasksAndLinearize(VPlan &Plan, bool FoldTail);

  /// Add branch weight metadata, if the \p Plan's middle block is terminated by
  /// a BranchOnCond recipe.
  static void
  addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF,
                                    std::optional<unsigned> VScaleForTuning);

  /// Update the resume phis in the scalar preheader after creating wide recipes
  /// for first-order recurrences, reductions and inductions. End values for
  /// inductions are added to \p IVEndValues.
  static void
  updateScalarResumePhis(VPlan &Plan,
                         DenseMap<VPValue *, VPValue *> &IVEndValues);

  /// Handle users in the exit block for first order reductions in the original
  /// exit block. The penultimate value of recurrences is fed to their LCSSA phi
  /// users in the original exit block using the VPIRInstruction wrapping to the
  /// LCSSA phi.
  static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range);
};

} // namespace llvm

#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H