Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r-- llvm/lib/Transforms/IPO/FunctionAttrs.cpp | 119
-rw-r--r-- llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp | 9
-rw-r--r-- llvm/lib/Transforms/Utils/SCCPSolver.cpp | 39
-rw-r--r-- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 149
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 215
-rw-r--r-- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 6
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 199
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 13
8 files changed, 461 insertions, 288 deletions
diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index 8d9a0e7..50130da 100644
--- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -2067,6 +2067,36 @@ static void inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes,
AI.run(SCCNodes, Changed);
}
+// Helper used when deciding whether the function 'F' can be marked
+// 'norecurse'. Returns true if any call within 'F' could lead to a recursive
+// call back to 'F', and false otherwise.
+// The 'AnyFunctionsAddressIsTaken' parameter is a module-wide flag
+// that is true if any function's address is taken, or if any function
+// has external linkage. This is used to determine the safety of
+// external/library calls.
+static bool mayHaveRecursiveCallee(Function &F,
+ bool AnyFunctionsAddressIsTaken = true) {
+ for (const auto &BB : F) {
+ for (const auto &I : BB.instructionsWithoutDebug()) {
+ if (const auto *CB = dyn_cast<CallBase>(&I)) {
+ const Function *Callee = CB->getCalledFunction();
+ if (!Callee || Callee == &F)
+ return true;
+
+ if (Callee->doesNotRecurse())
+ continue;
+
+ if (!AnyFunctionsAddressIsTaken ||
+ (Callee->isDeclaration() &&
+ Callee->hasFnAttribute(Attribute::NoCallback)))
+ continue;
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
static void addNoRecurseAttrs(const SCCNodeSet &SCCNodes,
SmallPtrSet<Function *, 8> &Changed) {
// Try and identify functions that do not recurse.
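A minimal source-level sketch of what the helper above checks, using hypothetical functions (the pass itself reasons over LLVM IR, and the norecurse/nocallback status of a callee comes from attributes rather than from source):

    void external_fn();                          // unknown body, no 'nocallback'
    static int leaf(int x) { return x + 1; }     // no calls: mayHaveRecursiveCallee -> false
    static int ok(int x) { return leaf(x); }     // false once 'leaf' is marked norecurse
    static int self(int x) { return x ? self(x - 1) : 0; } // call back to itself -> true
    static int bad(int x) { external_fn(); return x; }     // unknown external callee -> true

With the default AnyFunctionsAddressIsTaken = true, the call in 'bad' blocks the attribute because 'external_fn' is neither norecurse nor a 'nocallback' declaration; indirect calls (no known callee) are always treated as potentially recursive.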
@@ -2078,28 +2108,14 @@ static void addNoRecurseAttrs(const SCCNodeSet &SCCNodes,
Function *F = *SCCNodes.begin();
if (!F || !F->hasExactDefinition() || F->doesNotRecurse())
return;
-
- // If all of the calls in F are identifiable and are to norecurse functions, F
- // is norecurse. This check also detects self-recursion as F is not currently
- // marked norecurse, so any called from F to F will not be marked norecurse.
- for (auto &BB : *F)
- for (auto &I : BB.instructionsWithoutDebug())
- if (auto *CB = dyn_cast<CallBase>(&I)) {
- Function *Callee = CB->getCalledFunction();
- if (!Callee || Callee == F ||
- (!Callee->doesNotRecurse() &&
- !(Callee->isDeclaration() &&
- Callee->hasFnAttribute(Attribute::NoCallback))))
- // Function calls a potentially recursive function.
- return;
- }
-
- // Every call was to a non-recursive function other than this function, and
- // we have no indirect recursion as the SCC size is one. This function cannot
- // recurse.
- F->setDoesNotRecurse();
- ++NumNoRecurse;
- Changed.insert(F);
+ if (!mayHaveRecursiveCallee(*F)) {
+ // Every call was to a non-recursive function other than this function, and
+ // we have no indirect recursion as the SCC size is one. This function
+ // cannot recurse.
+ F->setDoesNotRecurse();
+ ++NumNoRecurse;
+ Changed.insert(F);
+ }
}
// Set the noreturn function attribute if possible.
@@ -2429,3 +2445,62 @@ ReversePostOrderFunctionAttrsPass::run(Module &M, ModuleAnalysisManager &AM) {
PA.preserve<LazyCallGraphAnalysis>();
return PA;
}
+
+PreservedAnalyses NoRecurseLTOInferencePass::run(Module &M,
+ ModuleAnalysisManager &MAM) {
+
+ // Check if any function in the whole program has its address taken or
+ // potentially has external linkage.
+ // We use this information when inferring the norecurse attribute: if no
+ // function has its address taken and all functions have internal linkage,
+ // there is no path for a callback into any user function.
+ bool AnyFunctionsAddressIsTaken = false;
+ for (Function &F : M) {
+ if (F.isDeclaration() || F.doesNotRecurse())
+ continue;
+ if (!F.hasLocalLinkage() || F.hasAddressTaken()) {
+ AnyFunctionsAddressIsTaken = true;
+ break;
+ }
+ }
+
+ // Run norecurse inference on all RefSCCs in the LazyCallGraph for this
+ // module.
+ bool Changed = false;
+ LazyCallGraph &CG = MAM.getResult<LazyCallGraphAnalysis>(M);
+ CG.buildRefSCCs();
+
+ for (LazyCallGraph::RefSCC &RC : CG.postorder_ref_sccs()) {
+ // Skip any RefSCC that is part of a call cycle. A RefSCC containing more
+ // than one SCC indicates a recursive relationship involving indirect calls.
+ if (RC.size() > 1)
+ continue;
+
+ // The RefSCC contains a single SCC. SCC size > 1 indicates mutually recursive
+ // functions. Ex: foo1 -> foo2 -> foo3 -> foo1.
+ LazyCallGraph::SCC &S = *RC.begin();
+ if (S.size() > 1)
+ continue;
+
+ // Get the single function from this SCC.
+ Function &F = S.begin()->getFunction();
+ if (!F.hasExactDefinition() || F.doesNotRecurse())
+ continue;
+
+ // If the analysis confirms that this function has no recursive calls
+ // (either direct, indirect, or through external linkages),
+ // we can safely apply the norecurse attribute.
+ if (!mayHaveRecursiveCallee(F, AnyFunctionsAddressIsTaken)) {
+ F.setDoesNotRecurse();
+ ++NumNoRecurse;
+ Changed = true;
+ }
+ }
+
+ PreservedAnalyses PA;
+ if (Changed)
+ PA.preserve<LazyCallGraphAnalysis>();
+ else
+ PA = PreservedAnalyses::all();
+ return PA;
+}
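A sketch of the whole-module situation the new NoRecurseLTOInferencePass is aimed at, with hypothetical functions: every definition has internal linkage and no address is taken, so an external declaration has no way to call back into user code and does not block norecurse inference; if any function were exported or had its address taken, AnyFunctionsAddressIsTaken would be set and such calls would again be treated conservatively.

    extern "C" void log_event(int);              // external declaration only
    static void helper(int x) { log_event(x); }  // no callback path back into the module
    static int run(int n) {
      for (int i = 0; i < n; ++i)
        helper(i);
      return 0;
    }
    // Taking an address, e.g. 'void (*fp)(int) = &helper;', or giving 'helper'
    // external linkage would flip AnyFunctionsAddressIsTaken and stop the
    // inference across the 'log_event' call.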
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index faeab95..cfdfd94 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -3986,6 +3986,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
void ModuleCallsiteContextGraph::updateAllocationCall(
CallInfo &Call, AllocationType AllocType) {
std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
+ removeAnyExistingAmbiguousAttribute(cast<CallBase>(Call.call()));
auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
"memprof", AllocTypeString);
cast<CallBase>(Call.call())->addFnAttr(A);
@@ -5661,9 +5662,10 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
// Include allocs that were already assigned a memprof function
- // attribute in the statistics.
- if (CB->getAttributes().hasFnAttr("memprof")) {
- assert(!MemProfMD);
+ // attribute in the statistics. Only do this for those that do not have
+ // memprof metadata, since we add an "ambiguous" memprof attribute by
+ // default.
+ if (CB->getAttributes().hasFnAttr("memprof") && !MemProfMD) {
CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
? AllocTypeColdThinBackend++
: AllocTypeNotColdThinBackend++;
@@ -5740,6 +5742,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
// clone J-1 (J==0 is the original clone and does not have a VMaps
// entry).
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
+ removeAnyExistingAmbiguousAttribute(CBClone);
CBClone->addFnAttr(A);
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
<< ore::NV("AllocationCall", CBClone) << " in clone "
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index af216cd..9693ae6 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -317,24 +317,29 @@ static Value *simplifyInstruction(SCCPSolver &Solver,
// Early exit if we know nothing about X.
if (LRange.isFullSet())
return nullptr;
- // We are allowed to refine the comparison to either true or false for out
- // of range inputs. Here we refine the comparison to true, i.e. we relax
- // the range check.
- auto NewCR = CR->exactUnionWith(LRange.inverse());
- // TODO: Check if we can narrow the range check to an equality test.
- // E.g, for X in [0, 4), X - 3 u< 2 -> X == 3
- if (!NewCR)
+ auto ConvertCRToICmp =
+ [&](const std::optional<ConstantRange> &NewCR) -> Value * {
+ ICmpInst::Predicate Pred;
+ APInt RHS;
+ // Check if we can represent NewCR as an icmp predicate.
+ if (NewCR && NewCR->getEquivalentICmp(Pred, RHS)) {
+ IRBuilder<NoFolder> Builder(&Inst);
+ Value *NewICmp =
+ Builder.CreateICmp(Pred, X, ConstantInt::get(X->getType(), RHS));
+ InsertedValues.insert(NewICmp);
+ return NewICmp;
+ }
return nullptr;
- ICmpInst::Predicate Pred;
- APInt RHS;
- // Check if we can represent NewCR as an icmp predicate.
- if (NewCR->getEquivalentICmp(Pred, RHS)) {
- IRBuilder<NoFolder> Builder(&Inst);
- Value *NewICmp =
- Builder.CreateICmp(Pred, X, ConstantInt::get(X->getType(), RHS));
- InsertedValues.insert(NewICmp);
- return NewICmp;
- }
+ };
+ // We are allowed to refine the comparison to either true or false for out
+ // of range inputs.
+ // Here we refine the comparison to false, and check if we can narrow the
+ // range check to a simpler test.
+ if (auto *V = ConvertCRToICmp(CR->exactIntersectWith(LRange)))
+ return V;
+ // Here we refine the comparison to true, i.e. we relax the range check.
+ if (auto *V = ConvertCRToICmp(CR->exactUnionWith(LRange.inverse())))
+ return V;
}
}
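The TODO removed above ("for X in [0, 4), X - 3 u< 2 -> X == 3") is exactly the case the new intersect-first path handles; a source-level analogue with the solver's range assumption spelled out in comments:

    // Assume the solver has proven x < 4 at this point.
    bool before(unsigned x) { return (x - 3u) < 2u; } // subtract-and-compare form
    bool after(unsigned x)  { return x == 3u; }       // narrowed equality test
    // For x in [0, 4), x - 3u wraps to a large value whenever x < 3, so only
    // x == 3 lands in [0, 2); intersecting the two ranges leaves the single
    // value 3, which getEquivalentICmp turns into the equality above.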
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 48055ad..148bfa8 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -5734,15 +5734,66 @@ bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) {
return Changed;
}
-static bool casesAreContiguous(SmallVectorImpl<ConstantInt *> &Cases) {
+struct ContiguousCasesResult {
+ ConstantInt *Min;
+ ConstantInt *Max;
+ BasicBlock *Dest;
+ BasicBlock *OtherDest;
+ SmallVectorImpl<ConstantInt *> *Cases;
+ SmallVectorImpl<ConstantInt *> *OtherCases;
+};
+
+static std::optional<ContiguousCasesResult>
+findContiguousCases(Value *Condition, SmallVectorImpl<ConstantInt *> &Cases,
+ SmallVectorImpl<ConstantInt *> &OtherCases,
+ BasicBlock *Dest, BasicBlock *OtherDest) {
assert(Cases.size() >= 1);
array_pod_sort(Cases.begin(), Cases.end(), constantIntSortPredicate);
- for (size_t I = 1, E = Cases.size(); I != E; ++I) {
- if (Cases[I - 1]->getValue() != Cases[I]->getValue() + 1)
- return false;
+ const APInt &Min = Cases.back()->getValue();
+ const APInt &Max = Cases.front()->getValue();
+ APInt Offset = Max - Min;
+ size_t ContiguousOffset = Cases.size() - 1;
+ if (Offset == ContiguousOffset) {
+ return ContiguousCasesResult{
+ /*Min=*/Cases.back(),
+ /*Max=*/Cases.front(),
+ /*Dest=*/Dest,
+ /*OtherDest=*/OtherDest,
+ /*Cases=*/&Cases,
+ /*OtherCases=*/&OtherCases,
+ };
}
- return true;
+ ConstantRange CR = computeConstantRange(Condition, /*ForSigned=*/false);
+ // If this is a wrapping contiguous range, that is, [Min, OtherMin] +
+ // [OtherMax, Max] (equivalently, the wrapped range [OtherMax, OtherMin]),
+ // then [OtherMin+1, OtherMax-1] is a contiguous range for the other
+ // destination. N.B. If CR is not a full range, Max+1 is not equal to Min,
+ // so the two pieces are not contiguous in plain unsigned arithmetic.
+ if (Max == CR.getUnsignedMax() && Min == CR.getUnsignedMin()) {
+ assert(Cases.size() >= 2);
+ auto *It =
+ std::adjacent_find(Cases.begin(), Cases.end(), [](auto L, auto R) {
+ return L->getValue() != R->getValue() + 1;
+ });
+ if (It == Cases.end())
+ return std::nullopt;
+ auto [OtherMax, OtherMin] = std::make_pair(*It, *std::next(It));
+ if ((Max - OtherMax->getValue()) + (OtherMin->getValue() - Min) ==
+ Cases.size() - 2) {
+ return ContiguousCasesResult{
+ /*Min=*/cast<ConstantInt>(
+ ConstantInt::get(OtherMin->getType(), OtherMin->getValue() + 1)),
+ /*Max=*/
+ cast<ConstantInt>(
+ ConstantInt::get(OtherMax->getType(), OtherMax->getValue() - 1)),
+ /*Dest=*/OtherDest,
+ /*OtherDest=*/Dest,
+ /*Cases=*/&OtherCases,
+ /*OtherCases=*/&Cases,
+ };
+ }
+ }
+ return std::nullopt;
}
static void createUnreachableSwitchDefault(SwitchInst *Switch,
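A sketch of the wrapping case handled by the second branch of findContiguousCases, using hypothetical values: the switch condition is provably in [0, 8), the cases cover both ends of that range, so their complement [2, 5] is the contiguous block and the branch sense is inverted.

    int before(unsigned x) {
      switch (x & 7u) {               // condition provably in [0, 8)
      case 0: case 1: case 6: case 7:
        return 1;                     // Dest
      default:
        return 0;                     // OtherDest
      }
    }
    int after(unsigned x) {
      return ((x & 7u) - 2u) < 4u ? 0 : 1; // in [2, 5] -> the former default
    }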
@@ -5779,7 +5830,6 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
bool HasDefault = !SI->defaultDestUnreachable();
auto *BB = SI->getParent();
-
// Partition the cases into two sets with different destinations.
BasicBlock *DestA = HasDefault ? SI->getDefaultDest() : nullptr;
BasicBlock *DestB = nullptr;
@@ -5813,37 +5863,62 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
assert(!CasesA.empty() || HasDefault);
// Figure out if one of the sets of cases form a contiguous range.
- SmallVectorImpl<ConstantInt *> *ContiguousCases = nullptr;
- BasicBlock *ContiguousDest = nullptr;
- BasicBlock *OtherDest = nullptr;
- if (!CasesA.empty() && casesAreContiguous(CasesA)) {
- ContiguousCases = &CasesA;
- ContiguousDest = DestA;
- OtherDest = DestB;
- } else if (casesAreContiguous(CasesB)) {
- ContiguousCases = &CasesB;
- ContiguousDest = DestB;
- OtherDest = DestA;
- } else
- return false;
+ std::optional<ContiguousCasesResult> ContiguousCases;
+
+ // Only one icmp is needed when there is only one case.
+ if (!HasDefault && CasesA.size() == 1)
+ ContiguousCases = ContiguousCasesResult{
+ /*Min=*/CasesA[0],
+ /*Max=*/CasesA[0],
+ /*Dest=*/DestA,
+ /*OtherDest=*/DestB,
+ /*Cases=*/&CasesA,
+ /*OtherCases=*/&CasesB,
+ };
+ else if (CasesB.size() == 1)
+ ContiguousCases = ContiguousCasesResult{
+ /*Min=*/CasesB[0],
+ /*Max=*/CasesB[0],
+ /*Dest=*/DestB,
+ /*OtherDest=*/DestA,
+ /*Cases=*/&CasesB,
+ /*OtherCases=*/&CasesA,
+ };
+ // Correctness: Cases to the default destination cannot be contiguous cases.
+ else if (!HasDefault)
+ ContiguousCases =
+ findContiguousCases(SI->getCondition(), CasesA, CasesB, DestA, DestB);
- // Start building the compare and branch.
+ if (!ContiguousCases)
+ ContiguousCases =
+ findContiguousCases(SI->getCondition(), CasesB, CasesA, DestB, DestA);
- Constant *Offset = ConstantExpr::getNeg(ContiguousCases->back());
- Constant *NumCases =
- ConstantInt::get(Offset->getType(), ContiguousCases->size());
+ if (!ContiguousCases)
+ return false;
- Value *Sub = SI->getCondition();
- if (!Offset->isNullValue())
- Sub = Builder.CreateAdd(Sub, Offset, Sub->getName() + ".off");
+ auto [Min, Max, Dest, OtherDest, Cases, OtherCases] = *ContiguousCases;
- Value *Cmp;
+ // Start building the compare and branch.
+
+ Constant *Offset = ConstantExpr::getNeg(Min);
+ Constant *NumCases = ConstantInt::get(Offset->getType(),
+ Max->getValue() - Min->getValue() + 1);
+ BranchInst *NewBI;
+ if (NumCases->isOneValue()) {
+ assert(Max->getValue() == Min->getValue());
+ Value *Cmp = Builder.CreateICmpEQ(SI->getCondition(), Min);
+ NewBI = Builder.CreateCondBr(Cmp, Dest, OtherDest);
+ }
// If NumCases overflowed, then all possible values jump to the successor.
- if (NumCases->isNullValue() && !ContiguousCases->empty())
- Cmp = ConstantInt::getTrue(SI->getContext());
- else
- Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch");
- BranchInst *NewBI = Builder.CreateCondBr(Cmp, ContiguousDest, OtherDest);
+ else if (NumCases->isNullValue() && !Cases->empty()) {
+ NewBI = Builder.CreateBr(Dest);
+ } else {
+ Value *Sub = SI->getCondition();
+ if (!Offset->isNullValue())
+ Sub = Builder.CreateAdd(Sub, Offset, Sub->getName() + ".off");
+ Value *Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch");
+ NewBI = Builder.CreateCondBr(Cmp, Dest, OtherDest);
+ }
// Update weight for the newly-created conditional branch.
if (hasBranchWeightMD(*SI)) {
@@ -5853,7 +5928,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
uint64_t TrueWeight = 0;
uint64_t FalseWeight = 0;
for (size_t I = 0, E = Weights.size(); I != E; ++I) {
- if (SI->getSuccessor(I) == ContiguousDest)
+ if (SI->getSuccessor(I) == Dest)
TrueWeight += Weights[I];
else
FalseWeight += Weights[I];
@@ -5868,15 +5943,15 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
}
// Prune obsolete incoming values off the successors' PHI nodes.
- for (auto BBI = ContiguousDest->begin(); isa<PHINode>(BBI); ++BBI) {
- unsigned PreviousEdges = ContiguousCases->size();
- if (ContiguousDest == SI->getDefaultDest())
+ for (auto BBI = Dest->begin(); isa<PHINode>(BBI); ++BBI) {
+ unsigned PreviousEdges = Cases->size();
+ if (Dest == SI->getDefaultDest())
++PreviousEdges;
for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
}
for (auto BBI = OtherDest->begin(); isa<PHINode>(BBI); ++BBI) {
- unsigned PreviousEdges = SI->getNumCases() - ContiguousCases->size();
+ unsigned PreviousEdges = OtherCases->size();
if (OtherDest == SI->getDefaultDest())
++PreviousEdges;
for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
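For the simple, non-wrapping shapes the rewritten turnSwitchRangeIntoICmp above still emits the classic offset-and-compare form, and a single remaining case now becomes a plain equality; a source-level analogue with hypothetical functions:

    int contiguous_before(unsigned x) {
      switch (x) { case 3: case 4: case 5: return 1; default: return 0; }
    }
    int contiguous_after(unsigned x) {
      return (x - 3u) < 3u;           // offset by the minimum case, compare with the case count
    }
    int single_before(unsigned x) {
      switch (x) { case 7: return 1; default: return 0; }
    }
    int single_after(unsigned x) {
      return x == 7u;                 // NumCases == 1: one icmp eq, no subtraction needed
    }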
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 56a3d6d..d393a9c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8201,211 +8201,6 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
}
}
-/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
-/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
-/// the end value of the induction.
-static VPInstruction *addResumePhiRecipeForInduction(
- VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
- VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
- auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
- // Truncated wide inductions resume from the last lane of their vector value
- // in the last vector iteration which is handled elsewhere.
- if (WideIntOrFp && WideIntOrFp->getTruncInst())
- return nullptr;
-
- VPValue *Start = WideIV->getStartValue();
- VPValue *Step = WideIV->getStepValue();
- const InductionDescriptor &ID = WideIV->getInductionDescriptor();
- VPValue *EndValue = VectorTC;
- if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
- EndValue = VectorPHBuilder.createDerivedIV(
- ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
- Start, VectorTC, Step);
- }
-
- // EndValue is derived from the vector trip count (which has the same type as
- // the widest induction) and thus may be wider than the induction here.
- Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
- if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
- EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
- ScalarTypeOfWideIV,
- WideIV->getDebugLoc());
- }
-
- auto *ResumePhiRecipe = ScalarPHBuilder.createScalarPhi(
- {EndValue, Start}, WideIV->getDebugLoc(), "bc.resume.val");
- return ResumePhiRecipe;
-}
-
-/// Create resume phis in the scalar preheader for first-order recurrences,
-/// reductions and inductions, and update the VPIRInstructions wrapping the
-/// original phis in the scalar header. End values for inductions are added to
-/// \p IVEndValues.
-static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
- DenseMap<VPValue *, VPValue *> &IVEndValues) {
- VPTypeAnalysis TypeInfo(Plan);
- auto *ScalarPH = Plan.getScalarPreheader();
- auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
- VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
- VPBuilder VectorPHBuilder(
- cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
- VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
- VPBuilder ScalarPHBuilder(ScalarPH);
- for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) {
- auto *ScalarPhiIRI = cast<VPIRPhi>(&ScalarPhiR);
-
- // TODO: Extract final value from induction recipe initially, optimize to
- // pre-computed end value together in optimizeInductionExitUsers.
- auto *VectorPhiR =
- cast<VPHeaderPHIRecipe>(Builder.getRecipe(&ScalarPhiIRI->getIRPhi()));
- if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
- if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction(
- WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
- &Plan.getVectorTripCount())) {
- assert(isa<VPPhi>(ResumePhi) && "Expected a phi");
- IVEndValues[WideIVR] = ResumePhi->getOperand(0);
- ScalarPhiIRI->addOperand(ResumePhi);
- continue;
- }
- // TODO: Also handle truncated inductions here. Computing end-values
- // separately should be done as VPlan-to-VPlan optimization, after
- // legalizing all resume values to use the last lane from the loop.
- assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
- "should only skip truncated wide inductions");
- continue;
- }
-
- // The backedge value provides the value to resume coming out of a loop,
- // which for FORs is a vector whose last element needs to be extracted. The
- // start value provides the value if the loop is bypassed.
- bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
- auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
- assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
- "Cannot handle loops with uncountable early exits");
- if (IsFOR)
- ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
- VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {},
- "vector.recur.extract");
- StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
- auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
- {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
- ScalarPhiIRI->addOperand(ResumePhiR);
- }
-}
-
-/// Handle users in the exit block for first order reductions in the original
-/// exit block. The penultimate value of recurrences is fed to their LCSSA phi
-/// users in the original exit block using the VPIRInstruction wrapping to the
-/// LCSSA phi.
-static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) {
- VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
- auto *ScalarPHVPBB = Plan.getScalarPreheader();
- auto *MiddleVPBB = Plan.getMiddleBlock();
- VPBuilder ScalarPHBuilder(ScalarPHVPBB);
- VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
-
- auto IsScalableOne = [](ElementCount VF) -> bool {
- return VF == ElementCount::getScalable(1);
- };
-
- for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
- auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
- if (!FOR)
- continue;
-
- assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
- "Cannot handle loops with uncountable early exits");
-
- // This is the second phase of vectorizing first-order recurrences, creating
- // extract for users outside the loop. An overview of the transformation is
- // described below. Suppose we have the following loop with some use after
- // the loop of the last a[i-1],
- //
- // for (int i = 0; i < n; ++i) {
- // t = a[i - 1];
- // b[i] = a[i] - t;
- // }
- // use t;
- //
- // There is a first-order recurrence on "a". For this loop, the shorthand
- // scalar IR looks like:
- //
- // scalar.ph:
- // s.init = a[-1]
- // br scalar.body
- //
- // scalar.body:
- // i = phi [0, scalar.ph], [i+1, scalar.body]
- // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
- // s2 = a[i]
- // b[i] = s2 - s1
- // br cond, scalar.body, exit.block
- //
- // exit.block:
- // use = lcssa.phi [s1, scalar.body]
- //
- // In this example, s1 is a recurrence because it's value depends on the
- // previous iteration. In the first phase of vectorization, we created a
- // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
- // for users in the scalar preheader and exit block.
- //
- // vector.ph:
- // v_init = vector(..., ..., ..., a[-1])
- // br vector.body
- //
- // vector.body
- // i = phi [0, vector.ph], [i+4, vector.body]
- // v1 = phi [v_init, vector.ph], [v2, vector.body]
- // v2 = a[i, i+1, i+2, i+3]
- // b[i] = v2 - v1
- // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
- // b[i, i+1, i+2, i+3] = v2 - v1
- // br cond, vector.body, middle.block
- //
- // middle.block:
- // vector.recur.extract.for.phi = v2(2)
- // vector.recur.extract = v2(3)
- // br cond, scalar.ph, exit.block
- //
- // scalar.ph:
- // scalar.recur.init = phi [vector.recur.extract, middle.block],
- // [s.init, otherwise]
- // br scalar.body
- //
- // scalar.body:
- // i = phi [0, scalar.ph], [i+1, scalar.body]
- // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
- // s2 = a[i]
- // b[i] = s2 - s1
- // br cond, scalar.body, exit.block
- //
- // exit.block:
- // lo = lcssa.phi [s1, scalar.body],
- // [vector.recur.extract.for.phi, middle.block]
- //
- // Now update VPIRInstructions modeling LCSSA phis in the exit block.
- // Extract the penultimate value of the recurrence and use it as operand for
- // the VPIRInstruction modeling the phi.
- for (VPUser *U : FOR->users()) {
- using namespace llvm::VPlanPatternMatch;
- if (!match(U, m_ExtractLastElement(m_Specific(FOR))))
- continue;
- // For VF vscale x 1, if vscale = 1, we are unable to extract the
- // penultimate value of the recurrence. Instead we rely on the existing
- // extract of the last element from the result of
- // VPInstruction::FirstOrderRecurrenceSplice.
- // TODO: Consider vscale_range info and UF.
- if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne,
- Range))
- return;
- VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
- VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
- {}, "vector.recur.extract.for.phi");
- cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement);
- }
- }
-}
-
VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {
@@ -8598,9 +8393,11 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
R->setOperand(1, WideIV->getStepValue());
}
- addExitUsersForFirstOrderRecurrences(*Plan, Range);
+ // TODO: We can't call runPass on these transforms yet, due to verifier
+ // failures.
+ VPlanTransforms::addExitUsersForFirstOrderRecurrences(*Plan, Range);
DenseMap<VPValue *, VPValue *> IVEndValues;
- addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
+ VPlanTransforms::addScalarResumePhis(*Plan, RecipeBuilder, IVEndValues);
// ---------------------------------------------------------------------------
// Transform initial VPlan: Apply previously taken decisions, in order, to
@@ -8711,7 +8508,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
DenseMap<VPValue *, VPValue *> IVEndValues;
// TODO: IVEndValues are not used yet in the native path, to optimize exit
// values.
- addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
+ // TODO: We can't call runPass on the transform yet, due to verifier
+ // failures.
+ VPlanTransforms::addScalarResumePhis(*Plan, RecipeBuilder, IVEndValues);
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index fedca65..91c3d42 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10620,7 +10620,8 @@ class InstructionsCompatibilityAnalysis {
/// Checks if the opcode is supported as the main opcode for copyable
/// elements.
static bool isSupportedOpcode(const unsigned Opcode) {
- return Opcode == Instruction::Add || Opcode == Instruction::LShr;
+ return Opcode == Instruction::Add || Opcode == Instruction::LShr ||
+ Opcode == Instruction::Shl;
}
/// Identifies the best candidate value, which represents main opcode
@@ -10937,6 +10938,7 @@ public:
switch (MainOpcode) {
case Instruction::Add:
case Instruction::LShr:
+ case Instruction::Shl:
VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
break;
default:
@@ -22006,6 +22008,8 @@ bool BoUpSLP::collectValuesToDemote(
return all_of(E.Scalars, [&](Value *V) {
if (isa<PoisonValue>(V))
return true;
+ if (E.isCopyableElement(V))
+ return true;
auto *I = cast<Instruction>(V);
KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
return AmtKnownBits.getMaxValue().ult(BitWidth);
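A rough sketch of the bundle shape the Shl addition is meant to cover, with hypothetical code; whether it actually vectorizes still depends on the usual SLP legality and cost checks:

    void store_pair(int *r, const int *a, int s) {
      r[0] = a[0] << s;   // real shl
      r[1] = a[1];        // copyable element: modeled as if it were a[1] << 0
    }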
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index ca63bf3..ebf833e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -4198,3 +4198,202 @@ void VPlanTransforms::addBranchWeightToMiddleTerminator(
MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
}
+
+/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
+/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
+/// the end value of the induction.
+static VPInstruction *addResumePhiRecipeForInduction(
+ VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
+ VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
+ auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
+ // Truncated wide inductions resume from the last lane of their vector value
+ // in the last vector iteration which is handled elsewhere.
+ if (WideIntOrFp && WideIntOrFp->getTruncInst())
+ return nullptr;
+
+ VPValue *Start = WideIV->getStartValue();
+ VPValue *Step = WideIV->getStepValue();
+ const InductionDescriptor &ID = WideIV->getInductionDescriptor();
+ VPValue *EndValue = VectorTC;
+ if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
+ EndValue = VectorPHBuilder.createDerivedIV(
+ ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
+ Start, VectorTC, Step);
+ }
+
+ // EndValue is derived from the vector trip count (which has the same type as
+ // the widest induction) and thus may be wider than the induction here.
+ Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
+ if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
+ EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
+ ScalarTypeOfWideIV,
+ WideIV->getDebugLoc());
+ }
+
+ auto *ResumePhiRecipe = ScalarPHBuilder.createScalarPhi(
+ {EndValue, Start}, WideIV->getDebugLoc(), "bc.resume.val");
+ return ResumePhiRecipe;
+}
+
+void VPlanTransforms::addScalarResumePhis(
+ VPlan &Plan, VPRecipeBuilder &Builder,
+ DenseMap<VPValue *, VPValue *> &IVEndValues) {
+ VPTypeAnalysis TypeInfo(Plan);
+ auto *ScalarPH = Plan.getScalarPreheader();
+ auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
+ VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
+ VPBuilder VectorPHBuilder(
+ cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
+ VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
+ VPBuilder ScalarPHBuilder(ScalarPH);
+ for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) {
+ auto *ScalarPhiIRI = cast<VPIRPhi>(&ScalarPhiR);
+
+ // TODO: Extract final value from induction recipe initially, optimize to
+ // pre-computed end value together in optimizeInductionExitUsers.
+ auto *VectorPhiR =
+ cast<VPHeaderPHIRecipe>(Builder.getRecipe(&ScalarPhiIRI->getIRPhi()));
+ if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
+ if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction(
+ WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
+ &Plan.getVectorTripCount())) {
+ assert(isa<VPPhi>(ResumePhi) && "Expected a phi");
+ IVEndValues[WideIVR] = ResumePhi->getOperand(0);
+ ScalarPhiIRI->addOperand(ResumePhi);
+ continue;
+ }
+ // TODO: Also handle truncated inductions here. Computing end-values
+ // separately should be done as VPlan-to-VPlan optimization, after
+ // legalizing all resume values to use the last lane from the loop.
+ assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
+ "should only skip truncated wide inductions");
+ continue;
+ }
+
+ // The backedge value provides the value to resume coming out of a loop,
+ // which for FORs is a vector whose last element needs to be extracted. The
+ // start value provides the value if the loop is bypassed.
+ bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
+ auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
+ assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
+ "Cannot handle loops with uncountable early exits");
+ if (IsFOR)
+ ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
+ VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {},
+ "vector.recur.extract");
+ StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
+ auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
+ {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
+ ScalarPhiIRI->addOperand(ResumePhiR);
+ }
+}
+
+void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
+ VFRange &Range) {
+ VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
+ auto *ScalarPHVPBB = Plan.getScalarPreheader();
+ auto *MiddleVPBB = Plan.getMiddleBlock();
+ VPBuilder ScalarPHBuilder(ScalarPHVPBB);
+ VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
+
+ auto IsScalableOne = [](ElementCount VF) -> bool {
+ return VF == ElementCount::getScalable(1);
+ };
+
+ for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
+ auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
+ if (!FOR)
+ continue;
+
+ assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
+ "Cannot handle loops with uncountable early exits");
+
+ // This is the second phase of vectorizing first-order recurrences, creating
+ // extracts for users outside the loop. An overview of the transformation is
+ // described below. Suppose we have the following loop with some use after
+ // the loop of the last a[i-1],
+ //
+ // for (int i = 0; i < n; ++i) {
+ // t = a[i - 1];
+ // b[i] = a[i] - t;
+ // }
+ // use t;
+ //
+ // There is a first-order recurrence on "a". For this loop, the shorthand
+ // scalar IR looks like:
+ //
+ // scalar.ph:
+ // s.init = a[-1]
+ // br scalar.body
+ //
+ // scalar.body:
+ // i = phi [0, scalar.ph], [i+1, scalar.body]
+ // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
+ // s2 = a[i]
+ // b[i] = s2 - s1
+ // br cond, scalar.body, exit.block
+ //
+ // exit.block:
+ // use = lcssa.phi [s1, scalar.body]
+ //
+ // In this example, s1 is a recurrence because its value depends on the
+ // previous iteration. In the first phase of vectorization, we created a
+ // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
+ // for users in the scalar preheader and exit block.
+ //
+ // vector.ph:
+ // v_init = vector(..., ..., ..., a[-1])
+ // br vector.body
+ //
+ // vector.body
+ // i = phi [0, vector.ph], [i+4, vector.body]
+ // v1 = phi [v_init, vector.ph], [v2, vector.body]
+ // v2 = a[i, i+1, i+2, i+3]
+ // b[i] = v2 - v1
+ // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
+ // b[i, i+1, i+2, i+3] = v2 - v1
+ // br cond, vector.body, middle.block
+ //
+ // middle.block:
+ // vector.recur.extract.for.phi = v2(2)
+ // vector.recur.extract = v2(3)
+ // br cond, scalar.ph, exit.block
+ //
+ // scalar.ph:
+ // scalar.recur.init = phi [vector.recur.extract, middle.block],
+ // [s.init, otherwise]
+ // br scalar.body
+ //
+ // scalar.body:
+ // i = phi [0, scalar.ph], [i+1, scalar.body]
+ // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
+ // s2 = a[i]
+ // b[i] = s2 - s1
+ // br cond, scalar.body, exit.block
+ //
+ // exit.block:
+ // lo = lcssa.phi [s1, scalar.body],
+ // [vector.recur.extract.for.phi, middle.block]
+ //
+ // Now update VPIRInstructions modeling LCSSA phis in the exit block.
+ // Extract the penultimate value of the recurrence and use it as operand for
+ // the VPIRInstruction modeling the phi.
+ for (VPUser *U : FOR->users()) {
+ using namespace llvm::VPlanPatternMatch;
+ if (!match(U, m_ExtractLastElement(m_Specific(FOR))))
+ continue;
+ // For VF vscale x 1, if vscale = 1, we are unable to extract the
+ // penultimate value of the recurrence. Instead we rely on the existing
+ // extract of the last element from the result of
+ // VPInstruction::FirstOrderRecurrenceSplice.
+ // TODO: Consider vscale_range info and UF.
+ if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne,
+ Range))
+ return;
+ VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
+ VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
+ {}, "vector.recur.extract.for.phi");
+ cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement);
+ }
+ }
+}
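addScalarResumePhis models the hand-off from the vector loop to the scalar remainder loop; a source-level sketch of what bc.resume.val corresponds to, assuming a hypothetical loop and VF = 4:

    void bump_all(float *a, int n) {
      int i = 0;
      for (; i + 4 <= n; i += 4)      // vector loop (VF = 4)
        for (int j = 0; j < 4; ++j)
          a[i + j] += 1.0f;
      // 'i' plays the role of bc.resume.val: the vector trip count when the
      // vector loop executed (the middle-block edge), or 0 when it was
      // bypassed, and the scalar remainder below resumes from it.
      for (; i < n; ++i)
        a[i] += 1.0f;
    }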
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 2f00e51..5a8a2bb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -363,6 +363,19 @@ struct VPlanTransforms {
static void
addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF,
std::optional<unsigned> VScaleForTuning);
+
+ /// Create resume phis in the scalar preheader for first-order recurrences,
+ /// reductions and inductions, and update the VPIRInstructions wrapping the
+ /// original phis in the scalar header. End values for inductions are added to
+ /// \p IVEndValues.
+ static void addScalarResumePhis(VPlan &Plan, VPRecipeBuilder &Builder,
+ DenseMap<VPValue *, VPValue *> &IVEndValues);
+
+ /// Handle users of first-order recurrences in the original exit block. The
+ /// penultimate value of a recurrence is fed to its LCSSA phi users in the
+ /// original exit block via the VPIRInstruction wrapping the LCSSA phi.
+ static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range);
};
} // namespace llvm