Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r--  llvm/lib/Transforms/IPO/FunctionAttrs.cpp                 | 119
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp     |  10
-rw-r--r--  llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp |  14
-rw-r--r--  llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp           |  34
-rw-r--r--  llvm/lib/Transforms/Utils/SCCPSolver.cpp                  |  39
-rw-r--r--  llvm/lib/Transforms/Utils/SimplifyCFG.cpp                 |  40
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp           | 231
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp           |   6
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.cpp                   |   9
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanHelpers.h              |  16
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp            | 126
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp         | 199
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.h           |  13
13 files changed, 556 insertions, 300 deletions
diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index 8d9a0e7..50130da 100644
--- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -2067,6 +2067,36 @@ static void inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes,
AI.run(SCCNodes, Changed);
}
+// Returns true if any call within 'F' could lead to a recursive call back
+// to 'F', and false otherwise; 'F' can only be marked 'norecurse' when this
+// returns false.
+// The 'AnyFunctionsAddressIsTaken' parameter is a module-wide flag that is
+// true if any function's address is taken or if any function has external
+// linkage. It is used to determine the safety of external/library calls.
+static bool mayHaveRecursiveCallee(Function &F,
+ bool AnyFunctionsAddressIsTaken = true) {
+ for (const auto &BB : F) {
+ for (const auto &I : BB.instructionsWithoutDebug()) {
+ if (const auto *CB = dyn_cast<CallBase>(&I)) {
+ const Function *Callee = CB->getCalledFunction();
+ if (!Callee || Callee == &F)
+ return true;
+
+ if (Callee->doesNotRecurse())
+ continue;
+
+ if (!AnyFunctionsAddressIsTaken ||
+ (Callee->isDeclaration() &&
+ Callee->hasFnAttribute(Attribute::NoCallback)))
+ continue;
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
static void addNoRecurseAttrs(const SCCNodeSet &SCCNodes,
SmallPtrSet<Function *, 8> &Changed) {
// Try and identify functions that do not recurse.
@@ -2078,28 +2108,14 @@ static void addNoRecurseAttrs(const SCCNodeSet &SCCNodes,
Function *F = *SCCNodes.begin();
if (!F || !F->hasExactDefinition() || F->doesNotRecurse())
return;
-
- // If all of the calls in F are identifiable and are to norecurse functions, F
- // is norecurse. This check also detects self-recursion as F is not currently
- // marked norecurse, so any called from F to F will not be marked norecurse.
- for (auto &BB : *F)
- for (auto &I : BB.instructionsWithoutDebug())
- if (auto *CB = dyn_cast<CallBase>(&I)) {
- Function *Callee = CB->getCalledFunction();
- if (!Callee || Callee == F ||
- (!Callee->doesNotRecurse() &&
- !(Callee->isDeclaration() &&
- Callee->hasFnAttribute(Attribute::NoCallback))))
- // Function calls a potentially recursive function.
- return;
- }
-
- // Every call was to a non-recursive function other than this function, and
- // we have no indirect recursion as the SCC size is one. This function cannot
- // recurse.
- F->setDoesNotRecurse();
- ++NumNoRecurse;
- Changed.insert(F);
+ if (!mayHaveRecursiveCallee(*F)) {
+ // Every call was to a non-recursive function other than this function, and
+ // we have no indirect recursion as the SCC size is one. This function
+ // cannot recurse.
+ F->setDoesNotRecurse();
+ ++NumNoRecurse;
+ Changed.insert(F);
+ }
}
// Set the noreturn function attribute if possible.
@@ -2429,3 +2445,62 @@ ReversePostOrderFunctionAttrsPass::run(Module &M, ModuleAnalysisManager &AM) {
PA.preserve<LazyCallGraphAnalysis>();
return PA;
}
+
+PreservedAnalyses NoRecurseLTOInferencePass::run(Module &M,
+ ModuleAnalysisManager &MAM) {
+
+ // Check if any function in the whole program has its address taken or has
+ // potentially external linkage.
+ // We use this information when inferring norecurse attribute: If there is
+ // no function whose address is taken and all functions have internal
+ // linkage, there is no path for a callback to any user function.
+ bool AnyFunctionsAddressIsTaken = false;
+ for (Function &F : M) {
+ if (F.isDeclaration() || F.doesNotRecurse())
+ continue;
+ if (!F.hasLocalLinkage() || F.hasAddressTaken()) {
+ AnyFunctionsAddressIsTaken = true;
+ break;
+ }
+ }
+
+ // Run norecurse inference on all RefSCCs in the LazyCallGraph for this
+ // module.
+ bool Changed = false;
+ LazyCallGraph &CG = MAM.getResult<LazyCallGraphAnalysis>(M);
+ CG.buildRefSCCs();
+
+ for (LazyCallGraph::RefSCC &RC : CG.postorder_ref_sccs()) {
+ // Skip any RefSCC that is part of a call cycle. A RefSCC containing more
+ // than one SCC indicates a recursive relationship involving indirect calls.
+ if (RC.size() > 1)
+ continue;
+
+ // The RefSCC contains a single SCC. An SCC of size > 1 indicates mutually
+ // recursive functions, e.g. foo1 -> foo2 -> foo3 -> foo1.
+ LazyCallGraph::SCC &S = *RC.begin();
+ if (S.size() > 1)
+ continue;
+
+ // Get the single function from this SCC.
+ Function &F = S.begin()->getFunction();
+ if (!F.hasExactDefinition() || F.doesNotRecurse())
+ continue;
+
+ // If the analysis confirms that this function has no recursive calls
+ // (either direct, indirect, or through external linkages),
+ // we can safely apply the norecurse attribute.
+ if (!mayHaveRecursiveCallee(F, AnyFunctionsAddressIsTaken)) {
+ F.setDoesNotRecurse();
+ ++NumNoRecurse;
+ Changed = true;
+ }
+ }
+
+ PreservedAnalyses PA;
+ if (Changed)
+ PA.preserve<LazyCallGraphAnalysis>();
+ else
+ PA = PreservedAnalyses::all();
+ return PA;
+}
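
The new NoRecurseLTOInferencePass boils down to a per-function rule: if no call inside F can lead back to F, F may be marked norecurse, and a module-wide address-taken/linkage flag decides whether calls to declarations are safe. Below is a minimal standalone sketch of that rule over a toy call graph (illustrative types and names only, not LLVM's API):

#include <cassert>
#include <map>
#include <string>
#include <vector>

struct ToyFunction {
  std::vector<std::string> Callees; // direct callees by name
  bool NoRecurse;                   // already known not to recurse
  bool IsDeclaration;               // body not available in this module
  bool HasNoCallback;               // declared with the nocallback attribute
};

// Returns true if some call in F could lead back to F. When AnyAddressTaken
// is false, calls to declarations are safe because no callback can reach
// user code.
static bool mayHaveRecursiveCallee(const std::map<std::string, ToyFunction> &M,
                                   const std::string &F,
                                   bool AnyAddressTaken) {
  for (const std::string &CalleeName : M.at(F).Callees) {
    if (CalleeName == F)
      return true; // direct self-recursion
    auto It = M.find(CalleeName);
    if (It == M.end())
      return true; // unknown or indirect callee: assume the worst
    const ToyFunction &Callee = It->second;
    if (Callee.NoRecurse)
      continue;
    if (!AnyAddressTaken || (Callee.IsDeclaration && Callee.HasNoCallback))
      continue; // no path back into user code
    return true;
  }
  return false;
}

int main() {
  std::map<std::string, ToyFunction> M;
  M["leaf"] = ToyFunction{{}, /*NoRecurse=*/true, false, false};
  M["foo"] = ToyFunction{{"leaf"}, false, false, false};
  M["self"] = ToyFunction{{"self"}, false, false, false};
  assert(!mayHaveRecursiveCallee(M, "foo", /*AnyAddressTaken=*/true));
  assert(mayHaveRecursiveCallee(M, "self", /*AnyAddressTaken=*/true));
  return 0;
}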
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 8f60e50..8c8fc69 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -3356,7 +3356,10 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) {
impliesPoisonOrCond(FalseVal, B, /*Expected=*/false)) {
// (A || B) || C --> A || (B | C)
return replaceInstUsesWith(
- SI, Builder.CreateLogicalOr(A, Builder.CreateOr(B, FalseVal)));
+ SI, Builder.CreateLogicalOr(A, Builder.CreateOr(B, FalseVal), "",
+ ProfcheckDisableMetadataFixes
+ ? nullptr
+ : cast<SelectInst>(CondVal)));
}
// (A && B) || (C && B) --> (A || C) && B
@@ -3398,7 +3401,10 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) {
impliesPoisonOrCond(TrueVal, B, /*Expected=*/true)) {
// (A && B) && C --> A && (B & C)
return replaceInstUsesWith(
- SI, Builder.CreateLogicalAnd(A, Builder.CreateAnd(B, TrueVal)));
+ SI, Builder.CreateLogicalAnd(A, Builder.CreateAnd(B, TrueVal), "",
+ ProfcheckDisableMetadataFixes
+ ? nullptr
+ : cast<SelectInst>(CondVal)));
}
// (A || B) && (C || B) --> (A && C) || B
diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 480ff4a..5ba2167 100644
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -261,6 +261,11 @@ static cl::opt<bool> ClIgnorePersonalityRoutine(
"list, do not create a wrapper for it."),
cl::Hidden, cl::init(false));
+static cl::opt<bool> ClAddGlobalNameSuffix(
+ "dfsan-add-global-name-suffix",
+ cl::desc("Whether to add .dfsan suffix to global names"), cl::Hidden,
+ cl::init(true));
+
static StringRef getGlobalTypeString(const GlobalValue &G) {
// Types of GlobalVariables are always pointer types.
Type *GType = G.getValueType();
@@ -1256,6 +1261,9 @@ DataFlowSanitizer::WrapperKind DataFlowSanitizer::getWrapperKind(Function *F) {
}
void DataFlowSanitizer::addGlobalNameSuffix(GlobalValue *GV) {
+ if (!ClAddGlobalNameSuffix)
+ return;
+
std::string GVName = std::string(GV->getName()), Suffix = ".dfsan";
GV->setName(GVName + Suffix);
@@ -1784,10 +1792,8 @@ bool DataFlowSanitizer::runImpl(
}
Value *DFSanFunction::getArgTLS(Type *T, unsigned ArgOffset, IRBuilder<> &IRB) {
- Value *Base = IRB.CreatePointerCast(DFS.ArgTLS, DFS.IntptrTy);
- if (ArgOffset)
- Base = IRB.CreateAdd(Base, ConstantInt::get(DFS.IntptrTy, ArgOffset));
- return IRB.CreateIntToPtr(Base, PointerType::get(*DFS.Ctx, 0), "_dfsarg");
+ return IRB.CreatePtrAdd(DFS.ArgTLS, ConstantInt::get(DFS.IntptrTy, ArgOffset),
+ "_dfsarg");
}
Value *DFSanFunction::getRetvalTLS(Type *T, IRBuilder<> &IRB) {
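
For getArgTLS, the ptrtoint/add/inttoptr sequence is folded into a single byte-offset pointer add. As a plain C++ analogy (not the emitted IR), both forms compute the same address:

#include <cassert>
#include <cstdint>

int main() {
  char Buffer[64] = {};
  char *Base = Buffer;     // stands in for the ArgTLS base pointer
  unsigned ArgOffset = 16; // byte offset of the argument shadow slot

  // Old lowering: ptrtoint + add + inttoptr.
  char *ViaInt = reinterpret_cast<char *>(
      reinterpret_cast<std::uintptr_t>(Base) + ArgOffset);

  // New lowering: a single byte-offset pointer add (CreatePtrAdd).
  char *ViaPtrAdd = Base + ArgOffset;

  assert(ViaInt == ViaPtrAdd);
  return 0;
}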
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index e9a3e98..7968a5d 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -120,6 +120,12 @@ static cl::opt<unsigned>
cl::desc("Maximum cost accepted for the transformation"),
cl::Hidden, cl::init(50));
+static cl::opt<double> MaxClonedRate(
+ "dfa-max-cloned-rate",
+ cl::desc(
+ "Maximum cloned instructions rate accepted for the transformation"),
+ cl::Hidden, cl::init(7.5));
+
namespace {
class SelectInstToUnfold {
@@ -828,6 +834,7 @@ private:
/// also returns false if it is illegal to clone some required block.
bool isLegalAndProfitableToTransform() {
CodeMetrics Metrics;
+ uint64_t NumClonedInst = 0;
SwitchInst *Switch = SwitchPaths->getSwitchInst();
// Don't thread switch without multiple successors.
@@ -837,7 +844,6 @@ private:
// Note that DuplicateBlockMap is not being used as intended here. It is
// just being used to ensure (BB, State) pairs are only counted once.
DuplicateBlockMap DuplicateMap;
-
for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) {
PathType PathBBs = TPath.getPath();
APInt NextState = TPath.getExitValue();
@@ -848,6 +854,7 @@ private:
BasicBlock *VisitedBB = getClonedBB(BB, NextState, DuplicateMap);
if (!VisitedBB) {
Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
+ NumClonedInst += BB->sizeWithoutDebug();
DuplicateMap[BB].push_back({BB, NextState});
}
@@ -865,6 +872,7 @@ private:
if (VisitedBB)
continue;
Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
+ NumClonedInst += BB->sizeWithoutDebug();
DuplicateMap[BB].push_back({BB, NextState});
}
@@ -901,6 +909,22 @@ private:
}
}
+ // Too many cloned instructions slow down later optimizations, especially
+ // SLPVectorizer.
+ // TODO: Thread the switch partially before reaching the threshold.
+ uint64_t NumOrigInst = 0;
+ for (auto *BB : DuplicateMap.keys())
+ NumOrigInst += BB->sizeWithoutDebug();
+ if (double(NumClonedInst) / double(NumOrigInst) > MaxClonedRate) {
+ LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, too many "
+ "instructions will be cloned\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NotProfitable", Switch)
+ << "Too many instructions will be cloned.";
+ });
+ return false;
+ }
+
InstructionCost DuplicationCost = 0;
unsigned JumpTableSize = 0;
@@ -969,14 +993,14 @@ private:
SmallPtrSet<BasicBlock *, 16> BlocksToClean;
BlocksToClean.insert_range(successors(SwitchBlock));
- for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) {
+ for (const ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) {
createExitPath(NewDefs, TPath, DuplicateMap, BlocksToClean, &DTU);
NumPaths++;
}
// After all paths are cloned, now update the last successor of the cloned
// path so it skips over the switch statement
- for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths())
+ for (const ThreadingPath &TPath : SwitchPaths->getThreadingPaths())
updateLastSuccessor(TPath, DuplicateMap, &DTU);
// For each instruction that was cloned and used outside, update its uses
@@ -993,7 +1017,7 @@ private:
/// To remember the correct destination, we have to duplicate blocks
/// corresponding to each state. Also update the terminating instruction of
/// the predecessors, and phis in the successor blocks.
- void createExitPath(DefMap &NewDefs, ThreadingPath &Path,
+ void createExitPath(DefMap &NewDefs, const ThreadingPath &Path,
DuplicateBlockMap &DuplicateMap,
SmallPtrSet<BasicBlock *, 16> &BlocksToClean,
DomTreeUpdater *DTU) {
@@ -1239,7 +1263,7 @@ private:
///
/// Note that this is an optional step and would have been done in later
/// optimizations, but it makes the CFG significantly easier to work with.
- void updateLastSuccessor(ThreadingPath &TPath,
+ void updateLastSuccessor(const ThreadingPath &TPath,
DuplicateBlockMap &DuplicateMap,
DomTreeUpdater *DTU) {
APInt NextState = TPath.getExitValue();
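
The new profitability gate compares how many instructions all threading paths would clone against the size of the blocks being duplicated, rejecting the transform above a configurable ratio. A minimal sketch of that test, assuming the default -dfa-max-cloned-rate of 7.5:

#include <cstdint>
#include <iostream>

// Not profitable when cloning would grow the original blocks by more than
// MaxClonedRate, mirroring the -dfa-max-cloned-rate check added above.
static bool isCloneGrowthAcceptable(uint64_t NumClonedInst,
                                    uint64_t NumOrigInst,
                                    double MaxClonedRate = 7.5) {
  if (NumOrigInst == 0)
    return true; // nothing would be cloned
  return double(NumClonedInst) / double(NumOrigInst) <= MaxClonedRate;
}

int main() {
  std::cout << isCloneGrowthAcceptable(40, 10) << '\n';  // rate 4.0  -> 1
  std::cout << isCloneGrowthAcceptable(100, 10) << '\n'; // rate 10.0 -> 0
  return 0;
}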
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index af216cd..9693ae6 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -317,24 +317,29 @@ static Value *simplifyInstruction(SCCPSolver &Solver,
// Early exit if we know nothing about X.
if (LRange.isFullSet())
return nullptr;
- // We are allowed to refine the comparison to either true or false for out
- // of range inputs. Here we refine the comparison to true, i.e. we relax
- // the range check.
- auto NewCR = CR->exactUnionWith(LRange.inverse());
- // TODO: Check if we can narrow the range check to an equality test.
- // E.g, for X in [0, 4), X - 3 u< 2 -> X == 3
- if (!NewCR)
+ auto ConvertCRToICmp =
+ [&](const std::optional<ConstantRange> &NewCR) -> Value * {
+ ICmpInst::Predicate Pred;
+ APInt RHS;
+ // Check if we can represent NewCR as an icmp predicate.
+ if (NewCR && NewCR->getEquivalentICmp(Pred, RHS)) {
+ IRBuilder<NoFolder> Builder(&Inst);
+ Value *NewICmp =
+ Builder.CreateICmp(Pred, X, ConstantInt::get(X->getType(), RHS));
+ InsertedValues.insert(NewICmp);
+ return NewICmp;
+ }
return nullptr;
- ICmpInst::Predicate Pred;
- APInt RHS;
- // Check if we can represent NewCR as an icmp predicate.
- if (NewCR->getEquivalentICmp(Pred, RHS)) {
- IRBuilder<NoFolder> Builder(&Inst);
- Value *NewICmp =
- Builder.CreateICmp(Pred, X, ConstantInt::get(X->getType(), RHS));
- InsertedValues.insert(NewICmp);
- return NewICmp;
- }
+ };
+ // We are allowed to refine the comparison to either true or false for out
+ // of range inputs.
+ // Here we refine the comparison to false, and check if we can narrow the
+ // range check to a simpler test.
+ if (auto *V = ConvertCRToICmp(CR->exactIntersectWith(LRange)))
+ return V;
+ // Here we refine the comparison to true, i.e. we relax the range check.
+ if (auto *V = ConvertCRToICmp(CR->exactUnionWith(LRange.inverse())))
+ return V;
}
}
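
Trying the intersect-based refinement (towards false) before the union-based one (towards true) is what enables narrowing a range check into an equality test. A small brute-force check of the example from the old TODO, for X constrained to [0, 4) and the comparison X - 3 u< 2, using 8-bit unsigned arithmetic:

#include <cassert>
#include <cstdint>

int main() {
  // With X known to lie in [0, 4), the range check "X - 3 u< 2" is true
  // exactly when X == 3, so refining the compare towards false lets the
  // solver replace it with a plain equality test.
  for (uint8_t X = 0; X < 4; ++X) {
    bool RangeCheck = static_cast<uint8_t>(X - 3) < 2; // wrapping unsigned sub
    bool Equality = (X == 3);
    assert(RangeCheck == Equality);
  }
  return 0;
}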
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 148bfa8..b8cfe3a 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -4895,9 +4895,8 @@ bool SimplifyCFGOpt::simplifyTerminatorOnSelect(Instruction *OldTerm,
// We found both of the successors we were looking for.
// Create a conditional branch sharing the condition of the select.
BranchInst *NewBI = Builder.CreateCondBr(Cond, TrueBB, FalseBB);
- if (TrueWeight != FalseWeight)
- setBranchWeights(*NewBI, {TrueWeight, FalseWeight},
- /*IsExpected=*/false, /*ElideAllZero=*/true);
+ setBranchWeights(*NewBI, {TrueWeight, FalseWeight},
+ /*IsExpected=*/false, /*ElideAllZero=*/true);
}
} else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) {
// Neither of the selected blocks were successors, so this
@@ -4982,9 +4981,15 @@ bool SimplifyCFGOpt::simplifyIndirectBrOnSelect(IndirectBrInst *IBI,
BasicBlock *TrueBB = TBA->getBasicBlock();
BasicBlock *FalseBB = FBA->getBasicBlock();
+ // The select's profile becomes the profile of the conditional branch that
+ // replaces the indirect branch.
+ SmallVector<uint32_t> SelectBranchWeights(2);
+ if (!ProfcheckDisableMetadataFixes)
+ extractBranchWeights(*SI, SelectBranchWeights);
// Perform the actual simplification.
- return simplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB, 0,
- 0);
+ return simplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB,
+ SelectBranchWeights[0],
+ SelectBranchWeights[1]);
}
/// This is called when we find an icmp instruction
@@ -7952,19 +7957,27 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) {
BasicBlock *BB = IBI->getParent();
bool Changed = false;
+ SmallVector<uint32_t> BranchWeights;
+ const bool HasBranchWeights = !ProfcheckDisableMetadataFixes &&
+ extractBranchWeights(*IBI, BranchWeights);
+
+ DenseMap<const BasicBlock *, uint64_t> TargetWeight;
+ if (HasBranchWeights)
+ for (size_t I = 0, E = IBI->getNumDestinations(); I < E; ++I)
+ TargetWeight[IBI->getDestination(I)] += BranchWeights[I];
// Eliminate redundant destinations.
SmallPtrSet<Value *, 8> Succs;
SmallSetVector<BasicBlock *, 8> RemovedSuccs;
- for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
- BasicBlock *Dest = IBI->getDestination(i);
+ for (unsigned I = 0, E = IBI->getNumDestinations(); I != E; ++I) {
+ BasicBlock *Dest = IBI->getDestination(I);
if (!Dest->hasAddressTaken() || !Succs.insert(Dest).second) {
if (!Dest->hasAddressTaken())
RemovedSuccs.insert(Dest);
Dest->removePredecessor(BB);
- IBI->removeDestination(i);
- --i;
- --e;
+ IBI->removeDestination(I);
+ --I;
+ --E;
Changed = true;
}
}
@@ -7990,7 +8003,12 @@ bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) {
eraseTerminatorAndDCECond(IBI);
return true;
}
-
+ if (HasBranchWeights) {
+ SmallVector<uint64_t> NewBranchWeights(IBI->getNumDestinations());
+ for (size_t I = 0, E = IBI->getNumDestinations(); I < E; ++I)
+ NewBranchWeights[I] += TargetWeight.find(IBI->getDestination(I))->second;
+ setFittedBranchWeights(*IBI, NewBranchWeights, /*IsExpected=*/false);
+ }
if (SelectInst *SI = dyn_cast<SelectInst>(IBI->getAddress())) {
if (simplifyIndirectBrOnSelect(IBI, SI))
return requestResimplify();
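
When simplifyIndirectBr drops duplicate destinations, the profile weight of each removed edge is folded into the surviving edge for the same block. A standalone sketch of that accumulation (toy strings instead of basic blocks):

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  // Destinations of an indirectbr (with duplicates) and their branch weights.
  std::vector<std::string> Dests = {"bb1", "bb2", "bb1", "bb3"};
  std::vector<uint32_t> Weights = {10, 20, 30, 40};

  // Accumulate the weight per unique destination before deduplicating,
  // as the TargetWeight map does in simplifyIndirectBr.
  std::map<std::string, uint64_t> TargetWeight;
  for (unsigned I = 0; I < Dests.size(); ++I)
    TargetWeight[Dests[I]] += Weights[I];

  // Each surviving destination keeps the merged weight of its duplicates.
  for (const auto &[Dest, Weight] : TargetWeight)
    std::cout << Dest << ": " << Weight << '\n'; // bb1: 40, bb2: 20, bb3: 40
  return 0;
}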
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 56a3d6d..cee08ef 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3903,7 +3903,8 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
if (VF.isScalar())
continue;
- VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
+ *CM.PSE.getSE());
precomputeCosts(*Plan, VF, CostCtx);
auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -4160,7 +4161,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
// Add on other costs that are modelled in VPlan, but not in the legacy
// cost model.
- VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
+ *CM.PSE.getSE());
VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
assert(VectorRegion && "Expected to have a vector region!");
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
@@ -6852,7 +6854,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
ElementCount VF) const {
- VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE());
InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
// Now compute and add the VPlan-based cost.
@@ -7085,7 +7087,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
// simplifications not accounted for in the legacy cost model. If that's the
// case, don't trigger the assertion, as the extra simplifications may cause a
// different VF to be picked by the VPlan-based cost model.
- VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
+ *CM.PSE.getSE());
precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
// Verify that the VPlan-based and legacy cost models agree, except for VPlans
// with early exits and plans with additional VPlan simplifications. The
@@ -8201,211 +8204,6 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
}
}
-/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
-/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
-/// the end value of the induction.
-static VPInstruction *addResumePhiRecipeForInduction(
- VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
- VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
- auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
- // Truncated wide inductions resume from the last lane of their vector value
- // in the last vector iteration which is handled elsewhere.
- if (WideIntOrFp && WideIntOrFp->getTruncInst())
- return nullptr;
-
- VPValue *Start = WideIV->getStartValue();
- VPValue *Step = WideIV->getStepValue();
- const InductionDescriptor &ID = WideIV->getInductionDescriptor();
- VPValue *EndValue = VectorTC;
- if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
- EndValue = VectorPHBuilder.createDerivedIV(
- ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
- Start, VectorTC, Step);
- }
-
- // EndValue is derived from the vector trip count (which has the same type as
- // the widest induction) and thus may be wider than the induction here.
- Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
- if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
- EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
- ScalarTypeOfWideIV,
- WideIV->getDebugLoc());
- }
-
- auto *ResumePhiRecipe = ScalarPHBuilder.createScalarPhi(
- {EndValue, Start}, WideIV->getDebugLoc(), "bc.resume.val");
- return ResumePhiRecipe;
-}
-
-/// Create resume phis in the scalar preheader for first-order recurrences,
-/// reductions and inductions, and update the VPIRInstructions wrapping the
-/// original phis in the scalar header. End values for inductions are added to
-/// \p IVEndValues.
-static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
- DenseMap<VPValue *, VPValue *> &IVEndValues) {
- VPTypeAnalysis TypeInfo(Plan);
- auto *ScalarPH = Plan.getScalarPreheader();
- auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
- VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
- VPBuilder VectorPHBuilder(
- cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
- VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
- VPBuilder ScalarPHBuilder(ScalarPH);
- for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) {
- auto *ScalarPhiIRI = cast<VPIRPhi>(&ScalarPhiR);
-
- // TODO: Extract final value from induction recipe initially, optimize to
- // pre-computed end value together in optimizeInductionExitUsers.
- auto *VectorPhiR =
- cast<VPHeaderPHIRecipe>(Builder.getRecipe(&ScalarPhiIRI->getIRPhi()));
- if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
- if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction(
- WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
- &Plan.getVectorTripCount())) {
- assert(isa<VPPhi>(ResumePhi) && "Expected a phi");
- IVEndValues[WideIVR] = ResumePhi->getOperand(0);
- ScalarPhiIRI->addOperand(ResumePhi);
- continue;
- }
- // TODO: Also handle truncated inductions here. Computing end-values
- // separately should be done as VPlan-to-VPlan optimization, after
- // legalizing all resume values to use the last lane from the loop.
- assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
- "should only skip truncated wide inductions");
- continue;
- }
-
- // The backedge value provides the value to resume coming out of a loop,
- // which for FORs is a vector whose last element needs to be extracted. The
- // start value provides the value if the loop is bypassed.
- bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
- auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
- assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
- "Cannot handle loops with uncountable early exits");
- if (IsFOR)
- ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
- VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {},
- "vector.recur.extract");
- StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
- auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
- {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
- ScalarPhiIRI->addOperand(ResumePhiR);
- }
-}
-
-/// Handle users in the exit block for first order reductions in the original
-/// exit block. The penultimate value of recurrences is fed to their LCSSA phi
-/// users in the original exit block using the VPIRInstruction wrapping to the
-/// LCSSA phi.
-static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) {
- VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
- auto *ScalarPHVPBB = Plan.getScalarPreheader();
- auto *MiddleVPBB = Plan.getMiddleBlock();
- VPBuilder ScalarPHBuilder(ScalarPHVPBB);
- VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
-
- auto IsScalableOne = [](ElementCount VF) -> bool {
- return VF == ElementCount::getScalable(1);
- };
-
- for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
- auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
- if (!FOR)
- continue;
-
- assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
- "Cannot handle loops with uncountable early exits");
-
- // This is the second phase of vectorizing first-order recurrences, creating
- // extract for users outside the loop. An overview of the transformation is
- // described below. Suppose we have the following loop with some use after
- // the loop of the last a[i-1],
- //
- // for (int i = 0; i < n; ++i) {
- // t = a[i - 1];
- // b[i] = a[i] - t;
- // }
- // use t;
- //
- // There is a first-order recurrence on "a". For this loop, the shorthand
- // scalar IR looks like:
- //
- // scalar.ph:
- // s.init = a[-1]
- // br scalar.body
- //
- // scalar.body:
- // i = phi [0, scalar.ph], [i+1, scalar.body]
- // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
- // s2 = a[i]
- // b[i] = s2 - s1
- // br cond, scalar.body, exit.block
- //
- // exit.block:
- // use = lcssa.phi [s1, scalar.body]
- //
- // In this example, s1 is a recurrence because it's value depends on the
- // previous iteration. In the first phase of vectorization, we created a
- // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
- // for users in the scalar preheader and exit block.
- //
- // vector.ph:
- // v_init = vector(..., ..., ..., a[-1])
- // br vector.body
- //
- // vector.body
- // i = phi [0, vector.ph], [i+4, vector.body]
- // v1 = phi [v_init, vector.ph], [v2, vector.body]
- // v2 = a[i, i+1, i+2, i+3]
- // b[i] = v2 - v1
- // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
- // b[i, i+1, i+2, i+3] = v2 - v1
- // br cond, vector.body, middle.block
- //
- // middle.block:
- // vector.recur.extract.for.phi = v2(2)
- // vector.recur.extract = v2(3)
- // br cond, scalar.ph, exit.block
- //
- // scalar.ph:
- // scalar.recur.init = phi [vector.recur.extract, middle.block],
- // [s.init, otherwise]
- // br scalar.body
- //
- // scalar.body:
- // i = phi [0, scalar.ph], [i+1, scalar.body]
- // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
- // s2 = a[i]
- // b[i] = s2 - s1
- // br cond, scalar.body, exit.block
- //
- // exit.block:
- // lo = lcssa.phi [s1, scalar.body],
- // [vector.recur.extract.for.phi, middle.block]
- //
- // Now update VPIRInstructions modeling LCSSA phis in the exit block.
- // Extract the penultimate value of the recurrence and use it as operand for
- // the VPIRInstruction modeling the phi.
- for (VPUser *U : FOR->users()) {
- using namespace llvm::VPlanPatternMatch;
- if (!match(U, m_ExtractLastElement(m_Specific(FOR))))
- continue;
- // For VF vscale x 1, if vscale = 1, we are unable to extract the
- // penultimate value of the recurrence. Instead we rely on the existing
- // extract of the last element from the result of
- // VPInstruction::FirstOrderRecurrenceSplice.
- // TODO: Consider vscale_range info and UF.
- if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne,
- Range))
- return;
- VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
- VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
- {}, "vector.recur.extract.for.phi");
- cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement);
- }
- }
-}
-
VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {
@@ -8598,9 +8396,11 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
R->setOperand(1, WideIV->getStepValue());
}
- addExitUsersForFirstOrderRecurrences(*Plan, Range);
+ // TODO: We can't call runPass on these transforms yet, due to verifier
+ // failures.
+ VPlanTransforms::addExitUsersForFirstOrderRecurrences(*Plan, Range);
DenseMap<VPValue *, VPValue *> IVEndValues;
- addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
+ VPlanTransforms::addScalarResumePhis(*Plan, RecipeBuilder, IVEndValues);
// ---------------------------------------------------------------------------
// Transform initial VPlan: Apply previously taken decisions, in order, to
@@ -8621,7 +8421,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// TODO: Enable following transform when the EVL-version of extended-reduction
// and mulacc-reduction are implemented.
if (!CM.foldTailWithEVL()) {
- VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
+ *CM.PSE.getSE());
VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
CostCtx, Range);
}
@@ -8711,7 +8512,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
DenseMap<VPValue *, VPValue *> IVEndValues;
// TODO: IVEndValues are not used yet in the native path, to optimize exit
// values.
- addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
+ // TODO: We can't call runPass on the transform yet, due to verifier
+ // failures.
+ VPlanTransforms::addScalarResumePhis(*Plan, RecipeBuilder, IVEndValues);
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
@@ -10075,7 +9878,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool ForceVectorization =
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
- CM.CostKind);
+ CM.CostKind, *CM.PSE.getSE());
if (!ForceVectorization &&
!isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
LVP.getPlanFor(VF.Width), SEL,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index fedca65..91c3d42 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10620,7 +10620,8 @@ class InstructionsCompatibilityAnalysis {
/// Checks if the opcode is supported as the main opcode for copyable
/// elements.
static bool isSupportedOpcode(const unsigned Opcode) {
- return Opcode == Instruction::Add || Opcode == Instruction::LShr;
+ return Opcode == Instruction::Add || Opcode == Instruction::LShr ||
+ Opcode == Instruction::Shl;
}
/// Identifies the best candidate value, which represents main opcode
@@ -10937,6 +10938,7 @@ public:
switch (MainOpcode) {
case Instruction::Add:
case Instruction::LShr:
+ case Instruction::Shl:
VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
break;
default:
@@ -22006,6 +22008,8 @@ bool BoUpSLP::collectValuesToDemote(
return all_of(E.Scalars, [&](Value *V) {
if (isa<PoisonValue>(V))
return true;
+ if (E.isCopyableElement(V))
+ return true;
auto *I = cast<Instruction>(V);
KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
return AmtKnownBits.getMaxValue().ult(BitWidth);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 07b191a..2555ebe 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1772,7 +1772,8 @@ VPCostContext::getOperandInfo(VPValue *V) const {
}
InstructionCost VPCostContext::getScalarizationOverhead(
- Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF) {
+ Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
+ bool AlwaysIncludeReplicatingR) {
if (VF.isScalar())
return 0;
@@ -1792,7 +1793,11 @@ InstructionCost VPCostContext::getScalarizationOverhead(
SmallPtrSet<const VPValue *, 4> UniqueOperands;
SmallVector<Type *> Tys;
for (auto *Op : Operands) {
- if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) ||
+ if (Op->isLiveIn() ||
+ (!AlwaysIncludeReplicatingR &&
+ isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op)) ||
+ (isa<VPReplicateRecipe>(Op) &&
+ cast<VPReplicateRecipe>(Op)->getOpcode() == Instruction::Load) ||
!UniqueOperands.insert(Op).second)
continue;
Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index fc1a09e..1580a3b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -349,12 +349,14 @@ struct VPCostContext {
LoopVectorizationCostModel &CM;
SmallPtrSet<Instruction *, 8> SkipCostComputation;
TargetTransformInfo::TargetCostKind CostKind;
+ ScalarEvolution &SE;
VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
const VPlan &Plan, LoopVectorizationCostModel &CM,
- TargetTransformInfo::TargetCostKind CostKind)
+ TargetTransformInfo::TargetCostKind CostKind,
+ ScalarEvolution &SE)
: TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM),
- CostKind(CostKind) {}
+ CostKind(CostKind), SE(SE) {}
/// Return the cost for \p UI with \p VF using the legacy cost model as
/// fallback until computing the cost of all recipes migrates to VPlan.
@@ -374,10 +376,12 @@ struct VPCostContext {
/// Estimate the overhead of scalarizing a recipe with result type \p ResultTy
/// and \p Operands with \p VF. This is a convenience wrapper for the
- /// type-based getScalarizationOverhead API.
- InstructionCost getScalarizationOverhead(Type *ResultTy,
- ArrayRef<const VPValue *> Operands,
- ElementCount VF);
+ /// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR
+ /// is true, always compute the cost of scalarizing replicating operands.
+ InstructionCost
+ getScalarizationOverhead(Type *ResultTy, ArrayRef<const VPValue *> Operands,
+ ElementCount VF,
+ bool AlwaysIncludeReplicatingR = false);
};
/// This class can be used to assign names to VPValues. For VPValues without
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 67b9244..94e2628 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -40,6 +40,7 @@
#include <cassert>
using namespace llvm;
+using namespace llvm::VPlanPatternMatch;
using VectorParts = SmallVector<Value *, 2>;
@@ -303,7 +304,6 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
VPRecipeBase *OpR = Op->getDefiningRecipe();
// If the partial reduction is predicated, a select will be operand 0
- using namespace llvm::VPlanPatternMatch;
if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) {
OpR = Op->getDefiningRecipe();
}
@@ -1963,7 +1963,6 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
VPValue *Op0, *Op1;
- using namespace llvm::VPlanPatternMatch;
if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
(match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) ||
match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) {
@@ -2778,7 +2777,7 @@ VPExpressionRecipe::VPExpressionRecipe(
// Recipes in the expression, except the last one, must only be used by
// (other) recipes inside the expression. If there are other users, external
// to the expression, use a clone of the recipe for external users.
- for (VPSingleDefRecipe *R : ExpressionRecipes) {
+ for (VPSingleDefRecipe *R : reverse(ExpressionRecipes)) {
if (R != ExpressionRecipes.back() &&
any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
return !ExpressionRecipesAsSetOfUsers.contains(U);
@@ -3111,6 +3110,62 @@ bool VPReplicateRecipe::shouldPack() const {
});
}
+/// Returns true if \p Ptr is a pointer computation for which the legacy cost
+/// model computes a SCEV expression when computing the address cost.
+static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
+ auto *PtrR = Ptr->getDefiningRecipe();
+ if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
+ cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
+ Instruction::GetElementPtr) ||
+ isa<VPWidenGEPRecipe>(PtrR) ||
+ match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue()))))
+ return false;
+
+ // We are looking for a GEP where all indices are either loop invariant or
+ // inductions.
+ for (VPValue *Opd : drop_begin(PtrR->operands())) {
+ if (!Opd->isDefinedOutsideLoopRegions() &&
+ !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
+ return false;
+ }
+
+ return true;
+}
+
+/// Returns true if \p V is used as part of the address of another load or
+/// store.
+static bool isUsedByLoadStoreAddress(const VPUser *V) {
+ SmallPtrSet<const VPUser *, 4> Seen;
+ SmallVector<const VPUser *> WorkList = {V};
+
+ while (!WorkList.empty()) {
+ auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
+ if (!Cur || !Seen.insert(Cur).second)
+ continue;
+
+ for (VPUser *U : Cur->users()) {
+ if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
+ if (InterleaveR->getAddr() == Cur)
+ return true;
+ if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
+ if (RepR->getOpcode() == Instruction::Load &&
+ RepR->getOperand(0) == Cur)
+ return true;
+ if (RepR->getOpcode() == Instruction::Store &&
+ RepR->getOperand(1) == Cur)
+ return true;
+ }
+ if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
+ if (MemR->getAddr() == Cur && MemR->isConsecutive())
+ return true;
+ }
+ }
+
+ append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users());
+ }
+ return false;
+}
+
InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
Instruction *UI = cast<Instruction>(getUnderlyingValue());
@@ -3218,21 +3273,60 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
}
case Instruction::Load:
case Instruction::Store: {
- if (isSingleScalar()) {
- bool IsLoad = UI->getOpcode() == Instruction::Load;
- Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
- Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
- const Align Alignment = getLoadStoreAlignment(UI);
- unsigned AS = getLoadStoreAddressSpace(UI);
- TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
- InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
- UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
- return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
- ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);
- }
+ if (VF.isScalable() && !isSingleScalar())
+ return InstructionCost::getInvalid();
+
// TODO: See getMemInstScalarizationCost for how to handle replicating and
// predicated cases.
- break;
+ const VPRegionBlock *ParentRegion = getParent()->getParent();
+ if (ParentRegion && ParentRegion->isReplicator())
+ break;
+
+ bool IsLoad = UI->getOpcode() == Instruction::Load;
+ const VPValue *PtrOp = getOperand(!IsLoad);
+ // TODO: Handle cases where we need to pass a SCEV to
+ // getAddressComputationCost.
+ if (shouldUseAddressAccessSCEV(PtrOp))
+ break;
+
+ Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+ Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
+ const Align Alignment = getLoadStoreAlignment(UI);
+ unsigned AS = getLoadStoreAddressSpace(UI);
+ TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+ InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+ UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
+
+ Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
+ bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
+ bool UsedByLoadStoreAddress =
+ !PreferVectorizedAddressing && isUsedByLoadStoreAddress(this);
+ InstructionCost ScalarCost =
+ ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
+ PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE,
+ nullptr, Ctx.CostKind);
+ if (isSingleScalar())
+ return ScalarCost;
+
+ SmallVector<const VPValue *> OpsToScalarize;
+ Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
+ // Set ResultTy and OpsToScalarize if scalarization is needed. In general, we
+ // don't assign scalarization overhead when the target prefers vectorized
+ // addressing or when the loaded value is used as part of the address of
+ // another load or store.
+ if (!UsedByLoadStoreAddress) {
+ bool EfficientVectorLoadStore =
+ Ctx.TTI.supportsEfficientVectorElementLoadStore();
+ if (!(IsLoad && !PreferVectorizedAddressing) &&
+ !(!IsLoad && EfficientVectorLoadStore))
+ append_range(OpsToScalarize, operands());
+
+ if (!EfficientVectorLoadStore)
+ ResultTy = Ctx.Types.inferScalarType(this);
+ }
+
+ return (ScalarCost * VF.getFixedValue()) +
+ Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
}
}
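
The new cost logic for replicated loads and stores hinges on isUsedByLoadStoreAddress, a transitive def-use walk that stops as soon as a user consumes the value as a memory address. A standalone sketch of that worklist traversal over a toy graph (simplified: address users are pre-marked rather than checked operand by operand):

#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

// Toy def-use graph: Users maps each value to its users; AddressUsers marks
// users that consume their input as a load/store address.
struct ToyGraph {
  std::map<std::string, std::vector<std::string>> Users;
  std::set<std::string> AddressUsers;
};

// Returns true if V transitively reaches a user that treats it as a memory
// address, mirroring the worklist walk in isUsedByLoadStoreAddress.
static bool isUsedByAddress(const ToyGraph &G, const std::string &V) {
  std::set<std::string> Seen;
  std::vector<std::string> WorkList = {V};
  while (!WorkList.empty()) {
    std::string Cur = WorkList.back();
    WorkList.pop_back();
    if (!Seen.insert(Cur).second)
      continue;
    auto It = G.Users.find(Cur);
    if (It == G.Users.end())
      continue;
    for (const std::string &U : It->second) {
      if (G.AddressUsers.count(U))
        return true; // U consumes the value as an address
      WorkList.push_back(U);
    }
  }
  return false;
}

int main() {
  ToyGraph G;
  G.Users["load.a"] = {"add"};
  G.Users["add"] = {"gep"};
  G.Users["gep"] = {"store"};
  G.AddressUsers = {"store"};
  std::cout << isUsedByAddress(G, "load.a") << '\n'; // 1: feeds a store address
  std::cout << isUsedByAddress(G, "gep") << '\n';    // 1
  return 0;
}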
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index ca63bf3..ebf833e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -4198,3 +4198,202 @@ void VPlanTransforms::addBranchWeightToMiddleTerminator(
MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
}
+
+/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
+/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
+/// the end value of the induction.
+static VPInstruction *addResumePhiRecipeForInduction(
+ VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
+ VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
+ auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
+ // Truncated wide inductions resume from the last lane of their vector value
+ // in the last vector iteration which is handled elsewhere.
+ if (WideIntOrFp && WideIntOrFp->getTruncInst())
+ return nullptr;
+
+ VPValue *Start = WideIV->getStartValue();
+ VPValue *Step = WideIV->getStepValue();
+ const InductionDescriptor &ID = WideIV->getInductionDescriptor();
+ VPValue *EndValue = VectorTC;
+ if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
+ EndValue = VectorPHBuilder.createDerivedIV(
+ ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
+ Start, VectorTC, Step);
+ }
+
+ // EndValue is derived from the vector trip count (which has the same type as
+ // the widest induction) and thus may be wider than the induction here.
+ Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
+ if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
+ EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
+ ScalarTypeOfWideIV,
+ WideIV->getDebugLoc());
+ }
+
+ auto *ResumePhiRecipe = ScalarPHBuilder.createScalarPhi(
+ {EndValue, Start}, WideIV->getDebugLoc(), "bc.resume.val");
+ return ResumePhiRecipe;
+}
+
+void VPlanTransforms::addScalarResumePhis(
+ VPlan &Plan, VPRecipeBuilder &Builder,
+ DenseMap<VPValue *, VPValue *> &IVEndValues) {
+ VPTypeAnalysis TypeInfo(Plan);
+ auto *ScalarPH = Plan.getScalarPreheader();
+ auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
+ VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
+ VPBuilder VectorPHBuilder(
+ cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
+ VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
+ VPBuilder ScalarPHBuilder(ScalarPH);
+ for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) {
+ auto *ScalarPhiIRI = cast<VPIRPhi>(&ScalarPhiR);
+
+ // TODO: Extract final value from induction recipe initially, optimize to
+ // pre-computed end value together in optimizeInductionExitUsers.
+ auto *VectorPhiR =
+ cast<VPHeaderPHIRecipe>(Builder.getRecipe(&ScalarPhiIRI->getIRPhi()));
+ if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
+ if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction(
+ WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
+ &Plan.getVectorTripCount())) {
+ assert(isa<VPPhi>(ResumePhi) && "Expected a phi");
+ IVEndValues[WideIVR] = ResumePhi->getOperand(0);
+ ScalarPhiIRI->addOperand(ResumePhi);
+ continue;
+ }
+ // TODO: Also handle truncated inductions here. Computing end-values
+ // separately should be done as VPlan-to-VPlan optimization, after
+ // legalizing all resume values to use the last lane from the loop.
+ assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
+ "should only skip truncated wide inductions");
+ continue;
+ }
+
+ // The backedge value provides the value to resume coming out of a loop,
+ // which for FORs is a vector whose last element needs to be extracted. The
+ // start value provides the value if the loop is bypassed.
+ bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
+ auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
+ assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
+ "Cannot handle loops with uncountable early exits");
+ if (IsFOR)
+ ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
+ VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {},
+ "vector.recur.extract");
+ StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
+ auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
+ {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
+ ScalarPhiIRI->addOperand(ResumePhiR);
+ }
+}
+
+void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
+ VFRange &Range) {
+ VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
+ auto *ScalarPHVPBB = Plan.getScalarPreheader();
+ auto *MiddleVPBB = Plan.getMiddleBlock();
+ VPBuilder ScalarPHBuilder(ScalarPHVPBB);
+ VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
+
+ auto IsScalableOne = [](ElementCount VF) -> bool {
+ return VF == ElementCount::getScalable(1);
+ };
+
+ for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
+ auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
+ if (!FOR)
+ continue;
+
+ assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
+ "Cannot handle loops with uncountable early exits");
+
+ // This is the second phase of vectorizing first-order recurrences, creating
+ // extract for users outside the loop. An overview of the transformation is
+ // described below. Suppose we have the following loop with some use after
+ // the loop of the last a[i-1],
+ //
+ // for (int i = 0; i < n; ++i) {
+ // t = a[i - 1];
+ // b[i] = a[i] - t;
+ // }
+ // use t;
+ //
+ // There is a first-order recurrence on "a". For this loop, the shorthand
+ // scalar IR looks like:
+ //
+ // scalar.ph:
+ // s.init = a[-1]
+ // br scalar.body
+ //
+ // scalar.body:
+ // i = phi [0, scalar.ph], [i+1, scalar.body]
+ // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
+ // s2 = a[i]
+ // b[i] = s2 - s1
+ // br cond, scalar.body, exit.block
+ //
+ // exit.block:
+ // use = lcssa.phi [s1, scalar.body]
+ //
+ // In this example, s1 is a recurrence because its value depends on the
+ // previous iteration. In the first phase of vectorization, we created a
+ // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
+ // for users in the scalar preheader and exit block.
+ //
+ // vector.ph:
+ // v_init = vector(..., ..., ..., a[-1])
+ // br vector.body
+ //
+ // vector.body
+ // i = phi [0, vector.ph], [i+4, vector.body]
+ // v1 = phi [v_init, vector.ph], [v2, vector.body]
+ // v2 = a[i, i+1, i+2, i+3]
+ // b[i] = v2 - v1
+ // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
+ // b[i, i+1, i+2, i+3] = v2 - v1
+ // br cond, vector.body, middle.block
+ //
+ // middle.block:
+ // vector.recur.extract.for.phi = v2(2)
+ // vector.recur.extract = v2(3)
+ // br cond, scalar.ph, exit.block
+ //
+ // scalar.ph:
+ // scalar.recur.init = phi [vector.recur.extract, middle.block],
+ // [s.init, otherwise]
+ // br scalar.body
+ //
+ // scalar.body:
+ // i = phi [0, scalar.ph], [i+1, scalar.body]
+ // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
+ // s2 = a[i]
+ // b[i] = s2 - s1
+ // br cond, scalar.body, exit.block
+ //
+ // exit.block:
+ // lo = lcssa.phi [s1, scalar.body],
+ // [vector.recur.extract.for.phi, middle.block]
+ //
+ // Now update VPIRInstructions modeling LCSSA phis in the exit block.
+ // Extract the penultimate value of the recurrence and use it as operand for
+ // the VPIRInstruction modeling the phi.
+ for (VPUser *U : FOR->users()) {
+ using namespace llvm::VPlanPatternMatch;
+ if (!match(U, m_ExtractLastElement(m_Specific(FOR))))
+ continue;
+ // For VF vscale x 1, if vscale = 1, we are unable to extract the
+ // penultimate value of the recurrence. Instead we rely on the existing
+ // extract of the last element from the result of
+ // VPInstruction::FirstOrderRecurrenceSplice.
+ // TODO: Consider vscale_range info and UF.
+ if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne,
+ Range))
+ return;
+ VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
+ VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
+ {}, "vector.recur.extract.for.phi");
+ cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement);
+ }
+ }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 2f00e51..5a8a2bb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -363,6 +363,19 @@ struct VPlanTransforms {
static void
addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF,
std::optional<unsigned> VScaleForTuning);
+
+ /// Create resume phis in the scalar preheader for first-order recurrences,
+ /// reductions and inductions, and update the VPIRInstructions wrapping the
+ /// original phis in the scalar header. End values for inductions are added to
+ /// \p IVEndValues.
+ static void addScalarResumePhis(VPlan &Plan, VPRecipeBuilder &Builder,
+ DenseMap<VPValue *, VPValue *> &IVEndValues);
+
+ /// Handle users of first-order recurrences in the original exit block. The
+ /// penultimate value of a recurrence is fed to its LCSSA phi users in the
+ /// original exit block via the VPIRInstruction wrapping the LCSSA phi.
+ static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range);
};
} // namespace llvm