Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r--  llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp |   9
-rw-r--r--  llvm/lib/Transforms/Utils/SCCPSolver.cpp                 |  39
-rw-r--r--  llvm/lib/Transforms/Utils/SimplifyCFG.cpp                | 149
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp          | 214
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp          |   6
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp        | 199
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.h          |  13
7 files changed, 363 insertions, 266 deletions
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index faeab95..cfdfd94 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -3986,6 +3986,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
 void ModuleCallsiteContextGraph::updateAllocationCall(
     CallInfo &Call, AllocationType AllocType) {
   std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
+  removeAnyExistingAmbiguousAttribute(cast<CallBase>(Call.call()));
   auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
                                 "memprof", AllocTypeString);
   cast<CallBase>(Call.call())->addFnAttr(A);
@@ -5661,9 +5662,10 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
         auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
 
         // Include allocs that were already assigned a memprof function
-        // attribute in the statistics.
-        if (CB->getAttributes().hasFnAttr("memprof")) {
-          assert(!MemProfMD);
+        // attribute in the statistics. Only do this for those that do not have
+        // memprof metadata, since we add an "ambiguous" memprof attribute by
+        // default.
+        if (CB->getAttributes().hasFnAttr("memprof") && !MemProfMD) {
           CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
               ? AllocTypeColdThinBackend++
               : AllocTypeNotColdThinBackend++;
@@ -5740,6 +5742,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
             // clone J-1 (J==0 is the original clone and does not have a VMaps
             // entry).
             CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
+            removeAnyExistingAmbiguousAttribute(CBClone);
             CBClone->addFnAttr(A);
             ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
                      << ore::NV("AllocationCall", CBClone) << " in clone "
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index af216cd..9693ae6 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -317,24 +317,29 @@ static Value *simplifyInstruction(SCCPSolver &Solver,
       // Early exit if we know nothing about X.
       if (LRange.isFullSet())
         return nullptr;
-      // We are allowed to refine the comparison to either true or false for out
-      // of range inputs. Here we refine the comparison to true, i.e. we relax
-      // the range check.
-      auto NewCR = CR->exactUnionWith(LRange.inverse());
-      // TODO: Check if we can narrow the range check to an equality test.
-      // E.g, for X in [0, 4), X - 3 u< 2 -> X == 3
-      if (!NewCR)
+      auto ConvertCRToICmp =
+          [&](const std::optional<ConstantRange> &NewCR) -> Value * {
+        ICmpInst::Predicate Pred;
+        APInt RHS;
+        // Check if we can represent NewCR as an icmp predicate.
+        if (NewCR && NewCR->getEquivalentICmp(Pred, RHS)) {
+          IRBuilder<NoFolder> Builder(&Inst);
+          Value *NewICmp =
+              Builder.CreateICmp(Pred, X, ConstantInt::get(X->getType(), RHS));
+          InsertedValues.insert(NewICmp);
+          return NewICmp;
+        }
         return nullptr;
-      ICmpInst::Predicate Pred;
-      APInt RHS;
-      // Check if we can represent NewCR as an icmp predicate.
-      if (NewCR->getEquivalentICmp(Pred, RHS)) {
-        IRBuilder<NoFolder> Builder(&Inst);
-        Value *NewICmp =
-            Builder.CreateICmp(Pred, X, ConstantInt::get(X->getType(), RHS));
-        InsertedValues.insert(NewICmp);
-        return NewICmp;
-      }
+      };
+      // We are allowed to refine the comparison to either true or false for
+      // out-of-range inputs.
+      // Here we refine the comparison to false, and check if we can narrow the
+      // range check to a simpler test.
+      if (auto *V = ConvertCRToICmp(CR->exactIntersectWith(LRange)))
+        return V;
+      // Here we refine the comparison to true, i.e. we relax the range check.
+      if (auto *V = ConvertCRToICmp(CR->exactUnionWith(LRange.inverse())))
+        return V;
     }
   }
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 48055ad..148bfa8 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -5734,15 +5734,66 @@ bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) {
   return Changed;
 }
 
-static bool casesAreContiguous(SmallVectorImpl<ConstantInt *> &Cases) {
+struct ContiguousCasesResult {
+  ConstantInt *Min;
+  ConstantInt *Max;
+  BasicBlock *Dest;
+  BasicBlock *OtherDest;
+  SmallVectorImpl<ConstantInt *> *Cases;
+  SmallVectorImpl<ConstantInt *> *OtherCases;
+};
+
+static std::optional<ContiguousCasesResult>
+findContiguousCases(Value *Condition, SmallVectorImpl<ConstantInt *> &Cases,
+                    SmallVectorImpl<ConstantInt *> &OtherCases,
+                    BasicBlock *Dest, BasicBlock *OtherDest) {
   assert(Cases.size() >= 1);
 
   array_pod_sort(Cases.begin(), Cases.end(), constantIntSortPredicate);
-  for (size_t I = 1, E = Cases.size(); I != E; ++I) {
-    if (Cases[I - 1]->getValue() != Cases[I]->getValue() + 1)
-      return false;
+  const APInt &Min = Cases.back()->getValue();
+  const APInt &Max = Cases.front()->getValue();
+  APInt Offset = Max - Min;
+  size_t ContiguousOffset = Cases.size() - 1;
+  if (Offset == ContiguousOffset) {
+    return ContiguousCasesResult{
+        /*Min=*/Cases.back(),
+        /*Max=*/Cases.front(),
+        /*Dest=*/Dest,
+        /*OtherDest=*/OtherDest,
+        /*Cases=*/&Cases,
+        /*OtherCases=*/&OtherCases,
+    };
   }
-  return true;
+  ConstantRange CR = computeConstantRange(Condition, /*ForSigned=*/false);
+  // If this is a wrapping contiguous range, that is, [Min, OtherMin] +
+  // [OtherMax, Max] (also [OtherMax, OtherMin]), then [OtherMin+1, OtherMax-1]
+  // is a contiguous range for the other destination. N.B. If CR is not a full
+  // range, Max+1 is not equal to Min, so the wrap from Max back to Min is not
+  // contiguous in plain arithmetic; the bounds check below guards that case.
+  if (Max == CR.getUnsignedMax() && Min == CR.getUnsignedMin()) {
+    assert(Cases.size() >= 2);
+    auto *It =
+        std::adjacent_find(Cases.begin(), Cases.end(), [](auto L, auto R) {
+          return L->getValue() != R->getValue() + 1;
+        });
+    if (It == Cases.end())
+      return std::nullopt;
+    auto [OtherMax, OtherMin] = std::make_pair(*It, *std::next(It));
+    if ((Max - OtherMax->getValue()) + (OtherMin->getValue() - Min) ==
+        Cases.size() - 2) {
+      return ContiguousCasesResult{
+          /*Min=*/cast<ConstantInt>(
+              ConstantInt::get(OtherMin->getType(), OtherMin->getValue() + 1)),
+          /*Max=*/
+          cast<ConstantInt>(
+              ConstantInt::get(OtherMax->getType(), OtherMax->getValue() - 1)),
+          /*Dest=*/OtherDest,
+          /*OtherDest=*/Dest,
+          /*Cases=*/&OtherCases,
+          /*OtherCases=*/&Cases,
+      };
+    }
+  }
+  return std::nullopt;
 }
 
 static void createUnreachableSwitchDefault(SwitchInst *Switch,
@@ -5779,7 +5830,6 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
   bool HasDefault = !SI->defaultDestUnreachable();
 
   auto *BB = SI->getParent();
-
   // Partition the cases into two sets with different destinations.
   BasicBlock *DestA = HasDefault ? SI->getDefaultDest() : nullptr;
   BasicBlock *DestB = nullptr;
@@ -5813,37 +5863,62 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
   assert(!CasesA.empty() || HasDefault);
 
   // Figure out if one of the sets of cases form a contiguous range.
-  SmallVectorImpl<ConstantInt *> *ContiguousCases = nullptr;
-  BasicBlock *ContiguousDest = nullptr;
-  BasicBlock *OtherDest = nullptr;
-  if (!CasesA.empty() && casesAreContiguous(CasesA)) {
-    ContiguousCases = &CasesA;
-    ContiguousDest = DestA;
-    OtherDest = DestB;
-  } else if (casesAreContiguous(CasesB)) {
-    ContiguousCases = &CasesB;
-    ContiguousDest = DestB;
-    OtherDest = DestA;
-  } else
-    return false;
+  std::optional<ContiguousCasesResult> ContiguousCases;
+
+  // Only one icmp is needed when there is only one case.
+  if (!HasDefault && CasesA.size() == 1)
+    ContiguousCases = ContiguousCasesResult{
+        /*Min=*/CasesA[0],
+        /*Max=*/CasesA[0],
+        /*Dest=*/DestA,
+        /*OtherDest=*/DestB,
+        /*Cases=*/&CasesA,
+        /*OtherCases=*/&CasesB,
+    };
+  else if (CasesB.size() == 1)
+    ContiguousCases = ContiguousCasesResult{
+        /*Min=*/CasesB[0],
+        /*Max=*/CasesB[0],
+        /*Dest=*/DestB,
+        /*OtherDest=*/DestA,
+        /*Cases=*/&CasesB,
+        /*OtherCases=*/&CasesA,
+    };
+  // Correctness: Cases to the default destination cannot be contiguous cases.
+  else if (!HasDefault)
+    ContiguousCases =
+        findContiguousCases(SI->getCondition(), CasesA, CasesB, DestA, DestB);
 
-  // Start building the compare and branch.
+  if (!ContiguousCases)
+    ContiguousCases =
+        findContiguousCases(SI->getCondition(), CasesB, CasesA, DestB, DestA);
 
-  Constant *Offset = ConstantExpr::getNeg(ContiguousCases->back());
-  Constant *NumCases =
-      ConstantInt::get(Offset->getType(), ContiguousCases->size());
+  if (!ContiguousCases)
+    return false;
 
-  Value *Sub = SI->getCondition();
-  if (!Offset->isNullValue())
-    Sub = Builder.CreateAdd(Sub, Offset, Sub->getName() + ".off");
+  auto [Min, Max, Dest, OtherDest, Cases, OtherCases] = *ContiguousCases;
 
-  Value *Cmp;
+  // Start building the compare and branch.
+
+  Constant *Offset = ConstantExpr::getNeg(Min);
+  Constant *NumCases = ConstantInt::get(Offset->getType(),
+                                        Max->getValue() - Min->getValue() + 1);
+  BranchInst *NewBI;
+  if (NumCases->isOneValue()) {
+    assert(Max->getValue() == Min->getValue());
+    Value *Cmp = Builder.CreateICmpEQ(SI->getCondition(), Min);
+    NewBI = Builder.CreateCondBr(Cmp, Dest, OtherDest);
+  }
   // If NumCases overflowed, then all possible values jump to the successor.
-  if (NumCases->isNullValue() && !ContiguousCases->empty())
-    Cmp = ConstantInt::getTrue(SI->getContext());
-  else
-    Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch");
-  BranchInst *NewBI = Builder.CreateCondBr(Cmp, ContiguousDest, OtherDest);
+  else if (NumCases->isNullValue() && !Cases->empty()) {
+    NewBI = Builder.CreateBr(Dest);
+  } else {
+    Value *Sub = SI->getCondition();
+    if (!Offset->isNullValue())
+      Sub = Builder.CreateAdd(Sub, Offset, Sub->getName() + ".off");
+    Value *Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch");
+    NewBI = Builder.CreateCondBr(Cmp, Dest, OtherDest);
+  }
 
   // Update weight for the newly-created conditional branch.
   if (hasBranchWeightMD(*SI)) {
@@ -5853,7 +5928,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
     uint64_t TrueWeight = 0;
     uint64_t FalseWeight = 0;
     for (size_t I = 0, E = Weights.size(); I != E; ++I) {
-      if (SI->getSuccessor(I) == ContiguousDest)
+      if (SI->getSuccessor(I) == Dest)
         TrueWeight += Weights[I];
       else
         FalseWeight += Weights[I];
@@ -5868,15 +5943,15 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
   }
 
   // Prune obsolete incoming values off the successors' PHI nodes.
-  for (auto BBI = ContiguousDest->begin(); isa<PHINode>(BBI); ++BBI) {
-    unsigned PreviousEdges = ContiguousCases->size();
-    if (ContiguousDest == SI->getDefaultDest())
+  for (auto BBI = Dest->begin(); isa<PHINode>(BBI); ++BBI) {
+    unsigned PreviousEdges = Cases->size();
+    if (Dest == SI->getDefaultDest())
       ++PreviousEdges;
     for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
       cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
   }
   for (auto BBI = OtherDest->begin(); isa<PHINode>(BBI); ++BBI) {
-    unsigned PreviousEdges = SI->getNumCases() - ContiguousCases->size();
+    unsigned PreviousEdges = OtherCases->size();
     if (OtherDest == SI->getDefaultDest())
       ++PreviousEdges;
     for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 56a3d6d..e434e73 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8201,211 +8201,6 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
   }
 }
 
-/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
-/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
-/// the end value of the induction.
-static VPInstruction *addResumePhiRecipeForInduction(
-    VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
-    VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
-  auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
-  // Truncated wide inductions resume from the last lane of their vector value
-  // in the last vector iteration which is handled elsewhere.
-  if (WideIntOrFp && WideIntOrFp->getTruncInst())
-    return nullptr;
-
-  VPValue *Start = WideIV->getStartValue();
-  VPValue *Step = WideIV->getStepValue();
-  const InductionDescriptor &ID = WideIV->getInductionDescriptor();
-  VPValue *EndValue = VectorTC;
-  if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
-    EndValue = VectorPHBuilder.createDerivedIV(
-        ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
-        Start, VectorTC, Step);
-  }
-
-  // EndValue is derived from the vector trip count (which has the same type as
-  // the widest induction) and thus may be wider than the induction here.
-  Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
-  if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
-    EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
-                                                ScalarTypeOfWideIV,
-                                                WideIV->getDebugLoc());
-  }
-
-  auto *ResumePhiRecipe = ScalarPHBuilder.createScalarPhi(
-      {EndValue, Start}, WideIV->getDebugLoc(), "bc.resume.val");
-  return ResumePhiRecipe;
-}
-
-/// Create resume phis in the scalar preheader for first-order recurrences,
-/// reductions and inductions, and update the VPIRInstructions wrapping the
-/// original phis in the scalar header. End values for inductions are added to
-/// \p IVEndValues.
-static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
-                                DenseMap<VPValue *, VPValue *> &IVEndValues) {
-  VPTypeAnalysis TypeInfo(Plan);
-  auto *ScalarPH = Plan.getScalarPreheader();
-  auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
-  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
-  VPBuilder VectorPHBuilder(
-      cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
-  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
-  VPBuilder ScalarPHBuilder(ScalarPH);
-  for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) {
-    auto *ScalarPhiIRI = cast<VPIRPhi>(&ScalarPhiR);
-
-    // TODO: Extract final value from induction recipe initially, optimize to
-    // pre-computed end value together in optimizeInductionExitUsers.
-    auto *VectorPhiR =
-        cast<VPHeaderPHIRecipe>(Builder.getRecipe(&ScalarPhiIRI->getIRPhi()));
-    if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
-      if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction(
-              WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
-              &Plan.getVectorTripCount())) {
-        assert(isa<VPPhi>(ResumePhi) && "Expected a phi");
-        IVEndValues[WideIVR] = ResumePhi->getOperand(0);
-        ScalarPhiIRI->addOperand(ResumePhi);
-        continue;
-      }
-      // TODO: Also handle truncated inductions here. Computing end-values
-      // separately should be done as VPlan-to-VPlan optimization, after
-      // legalizing all resume values to use the last lane from the loop.
-      assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
-             "should only skip truncated wide inductions");
-      continue;
-    }
-
-    // The backedge value provides the value to resume coming out of a loop,
-    // which for FORs is a vector whose last element needs to be extracted. The
-    // start value provides the value if the loop is bypassed.
-    bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
-    auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
-    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
-           "Cannot handle loops with uncountable early exits");
-    if (IsFOR)
-      ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
-          VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {},
-          "vector.recur.extract");
-    StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
-    auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
-        {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
-    ScalarPhiIRI->addOperand(ResumePhiR);
-  }
-}
-
-/// Handle users in the exit block for first order reductions in the original
-/// exit block. The penultimate value of recurrences is fed to their LCSSA phi
-/// users in the original exit block using the VPIRInstruction wrapping to the
-/// LCSSA phi.
-static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) {
-  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
-  auto *ScalarPHVPBB = Plan.getScalarPreheader();
-  auto *MiddleVPBB = Plan.getMiddleBlock();
-  VPBuilder ScalarPHBuilder(ScalarPHVPBB);
-  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
-
-  auto IsScalableOne = [](ElementCount VF) -> bool {
-    return VF == ElementCount::getScalable(1);
-  };
-
-  for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
-    auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
-    if (!FOR)
-      continue;
-
-    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
-           "Cannot handle loops with uncountable early exits");
-
-    // This is the second phase of vectorizing first-order recurrences, creating
-    // extract for users outside the loop. An overview of the transformation is
-    // described below. Suppose we have the following loop with some use after
-    // the loop of the last a[i-1],
-    //
-    //  for (int i = 0; i < n; ++i) {
-    //    t = a[i - 1];
-    //    b[i] = a[i] - t;
-    //  }
-    //  use t;
-    //
-    // There is a first-order recurrence on "a". For this loop, the shorthand
-    // scalar IR looks like:
-    //
-    //   scalar.ph:
-    //     s.init = a[-1]
-    //     br scalar.body
-    //
-    //   scalar.body:
-    //     i = phi [0, scalar.ph], [i+1, scalar.body]
-    //     s1 = phi [s.init, scalar.ph], [s2, scalar.body]
-    //     s2 = a[i]
-    //     b[i] = s2 - s1
-    //     br cond, scalar.body, exit.block
-    //
-    //   exit.block:
-    //     use = lcssa.phi [s1, scalar.body]
-    //
-    // In this example, s1 is a recurrence because it's value depends on the
-    // previous iteration. In the first phase of vectorization, we created a
-    // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
-    // for users in the scalar preheader and exit block.
-    //
-    //   vector.ph:
-    //     v_init = vector(..., ..., ..., a[-1])
-    //     br vector.body
-    //
-    //   vector.body
-    //     i = phi [0, vector.ph], [i+4, vector.body]
-    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
-    //     v2 = a[i, i+1, i+2, i+3]
-    //     b[i] = v2 - v1
-    //     // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
-    //     b[i, i+1, i+2, i+3] = v2 - v1
-    //     br cond, vector.body, middle.block
-    //
-    //   middle.block:
-    //     vector.recur.extract.for.phi = v2(2)
-    //     vector.recur.extract = v2(3)
-    //     br cond, scalar.ph, exit.block
-    //
-    //   scalar.ph:
-    //     scalar.recur.init = phi [vector.recur.extract, middle.block],
-    //                             [s.init, otherwise]
-    //     br scalar.body
-    //
-    //   scalar.body:
-    //     i = phi [0, scalar.ph], [i+1, scalar.body]
-    //     s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
-    //     s2 = a[i]
-    //     b[i] = s2 - s1
-    //     br cond, scalar.body, exit.block
-    //
-    //   exit.block:
-    //     lo = lcssa.phi [s1, scalar.body],
-    //                    [vector.recur.extract.for.phi, middle.block]
-    //
-    // Now update VPIRInstructions modeling LCSSA phis in the exit block.
-    // Extract the penultimate value of the recurrence and use it as operand for
-    // the VPIRInstruction modeling the phi.
-    for (VPUser *U : FOR->users()) {
-      using namespace llvm::VPlanPatternMatch;
-      if (!match(U, m_ExtractLastElement(m_Specific(FOR))))
-        continue;
-      // For VF vscale x 1, if vscale = 1, we are unable to extract the
-      // penultimate value of the recurrence. Instead we rely on the existing
-      // extract of the last element from the result of
-      // VPInstruction::FirstOrderRecurrenceSplice.
-      // TODO: Consider vscale_range info and UF.
-      if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne,
-                                                             Range))
-        return;
-      VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
-          VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
-          {}, "vector.recur.extract.for.phi");
-      cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement);
-    }
-  }
-}
-
 VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
     VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {
 
@@ -8598,9 +8393,11 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
       R->setOperand(1, WideIV->getStepValue());
   }
 
-  addExitUsersForFirstOrderRecurrences(*Plan, Range);
+  VPlanTransforms::runPass(
+      VPlanTransforms::addExitUsersForFirstOrderRecurrences, *Plan, Range);
   DenseMap<VPValue *, VPValue *> IVEndValues;
-  addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
+  VPlanTransforms::runPass(VPlanTransforms::addScalarResumePhis, *Plan,
+                           RecipeBuilder, IVEndValues);
 
   // ---------------------------------------------------------------------------
   // Transform initial VPlan: Apply previously taken decisions, in order, to
@@ -8711,7 +8508,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
   DenseMap<VPValue *, VPValue *> IVEndValues;
   // TODO: IVEndValues are not used yet in the native path, to optimize exit
   // values.
-  addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
+  VPlanTransforms::runPass(VPlanTransforms::addScalarResumePhis, *Plan,
+                           RecipeBuilder, IVEndValues);
 
   assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
   return Plan;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index fedca65..91c3d42 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10620,7 +10620,8 @@ class InstructionsCompatibilityAnalysis {
   /// Checks if the opcode is supported as the main opcode for copyable
   /// elements.
   static bool isSupportedOpcode(const unsigned Opcode) {
-    return Opcode == Instruction::Add || Opcode == Instruction::LShr;
+    return Opcode == Instruction::Add || Opcode == Instruction::LShr ||
+           Opcode == Instruction::Shl;
   }
 
   /// Identifies the best candidate value, which represents main opcode
@@ -10937,6 +10938,7 @@ public:
     switch (MainOpcode) {
     case Instruction::Add:
     case Instruction::LShr:
+    case Instruction::Shl:
      VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
      break;
    default:
@@ -22006,6 +22008,8 @@ bool BoUpSLP::collectValuesToDemote(
       return all_of(E.Scalars, [&](Value *V) {
         if (isa<PoisonValue>(V))
           return true;
+        if (E.isCopyableElement(V))
+          return true;
         auto *I = cast<Instruction>(V);
         KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
         return AmtKnownBits.getMaxValue().ult(BitWidth);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index ca63bf3..ebf833e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -4198,3 +4198,202 @@ void VPlanTransforms::addBranchWeightToMiddleTerminator(
       MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
   MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
 }
+
+/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
+/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
+/// the end value of the induction.
+static VPInstruction *addResumePhiRecipeForInduction(
+    VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
+    VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
+  auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
+  // Truncated wide inductions resume from the last lane of their vector value
+  // in the last vector iteration which is handled elsewhere.
+  if (WideIntOrFp && WideIntOrFp->getTruncInst())
+    return nullptr;
+
+  VPValue *Start = WideIV->getStartValue();
+  VPValue *Step = WideIV->getStepValue();
+  const InductionDescriptor &ID = WideIV->getInductionDescriptor();
+  VPValue *EndValue = VectorTC;
+  if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
+    EndValue = VectorPHBuilder.createDerivedIV(
+        ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
+        Start, VectorTC, Step);
+  }
+
+  // EndValue is derived from the vector trip count (which has the same type as
+  // the widest induction) and thus may be wider than the induction here.
+  Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
+  if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
+    EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
+                                                ScalarTypeOfWideIV,
+                                                WideIV->getDebugLoc());
+  }
+
+  auto *ResumePhiRecipe = ScalarPHBuilder.createScalarPhi(
+      {EndValue, Start}, WideIV->getDebugLoc(), "bc.resume.val");
+  return ResumePhiRecipe;
+}
+
+void VPlanTransforms::addScalarResumePhis(
+    VPlan &Plan, VPRecipeBuilder &Builder,
+    DenseMap<VPValue *, VPValue *> &IVEndValues) {
+  VPTypeAnalysis TypeInfo(Plan);
+  auto *ScalarPH = Plan.getScalarPreheader();
+  auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
+  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
+  VPBuilder VectorPHBuilder(
+      cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
+  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
+  VPBuilder ScalarPHBuilder(ScalarPH);
+  for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) {
+    auto *ScalarPhiIRI = cast<VPIRPhi>(&ScalarPhiR);
+
+    // TODO: Extract final value from induction recipe initially, optimize to
+    // pre-computed end value together in optimizeInductionExitUsers.
+    auto *VectorPhiR =
+        cast<VPHeaderPHIRecipe>(Builder.getRecipe(&ScalarPhiIRI->getIRPhi()));
+    if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
+      if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction(
+              WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
+              &Plan.getVectorTripCount())) {
+        assert(isa<VPPhi>(ResumePhi) && "Expected a phi");
+        IVEndValues[WideIVR] = ResumePhi->getOperand(0);
+        ScalarPhiIRI->addOperand(ResumePhi);
+        continue;
+      }
+      // TODO: Also handle truncated inductions here. Computing end-values
+      // separately should be done as VPlan-to-VPlan optimization, after
+      // legalizing all resume values to use the last lane from the loop.
+      assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
+             "should only skip truncated wide inductions");
+      continue;
+    }
+
+    // The backedge value provides the value to resume coming out of a loop,
+    // which for FORs is a vector whose last element needs to be extracted. The
+    // start value provides the value if the loop is bypassed.
+    bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
+    auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
+    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
+           "Cannot handle loops with uncountable early exits");
+    if (IsFOR)
+      ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
+          VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {},
+          "vector.recur.extract");
+    StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
+    auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
+        {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
+    ScalarPhiIRI->addOperand(ResumePhiR);
+  }
+}
+
+void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
+                                                           VFRange &Range) {
+  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
+  auto *ScalarPHVPBB = Plan.getScalarPreheader();
+  auto *MiddleVPBB = Plan.getMiddleBlock();
+  VPBuilder ScalarPHBuilder(ScalarPHVPBB);
+  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
+
+  auto IsScalableOne = [](ElementCount VF) -> bool {
+    return VF == ElementCount::getScalable(1);
+  };
+
+  for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
+    auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
+    if (!FOR)
+      continue;
+
+    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
+           "Cannot handle loops with uncountable early exits");
+
+    // This is the second phase of vectorizing first-order recurrences,
+    // creating extracts for users outside the loop. An overview of the
+    // transformation is described below. Suppose we have the following loop
+    // with some use after the loop of the last a[i-1],
+    //
+    //  for (int i = 0; i < n; ++i) {
+    //    t = a[i - 1];
+    //    b[i] = a[i] - t;
+    //  }
+    //  use t;
+    //
+    // There is a first-order recurrence on "a". For this loop, the shorthand
+    // scalar IR looks like:
+    //
+    //   scalar.ph:
+    //     s.init = a[-1]
+    //     br scalar.body
+    //
+    //   scalar.body:
+    //     i = phi [0, scalar.ph], [i+1, scalar.body]
+    //     s1 = phi [s.init, scalar.ph], [s2, scalar.body]
+    //     s2 = a[i]
+    //     b[i] = s2 - s1
+    //     br cond, scalar.body, exit.block
+    //
+    //   exit.block:
+    //     use = lcssa.phi [s1, scalar.body]
+    //
+    // In this example, s1 is a recurrence because its value depends on the
+    // previous iteration. In the first phase of vectorization, we created a
+    // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
+    // for users in the scalar preheader and exit block.
+    //
+    //   vector.ph:
+    //     v_init = vector(..., ..., ..., a[-1])
+    //     br vector.body
+    //
+    //   vector.body
+    //     i = phi [0, vector.ph], [i+4, vector.body]
+    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
+    //     v2 = a[i, i+1, i+2, i+3]
+    //     b[i] = v2 - v1
+    //     // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
+    //     b[i, i+1, i+2, i+3] = v2 - v1
+    //     br cond, vector.body, middle.block
+    //
+    //   middle.block:
+    //     vector.recur.extract.for.phi = v2(2)
+    //     vector.recur.extract = v2(3)
+    //     br cond, scalar.ph, exit.block
+    //
+    //   scalar.ph:
+    //     scalar.recur.init = phi [vector.recur.extract, middle.block],
+    //                             [s.init, otherwise]
+    //     br scalar.body
+    //
+    //   scalar.body:
+    //     i = phi [0, scalar.ph], [i+1, scalar.body]
+    //     s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
+    //     s2 = a[i]
+    //     b[i] = s2 - s1
+    //     br cond, scalar.body, exit.block
+    //
+    //   exit.block:
+    //     lo = lcssa.phi [s1, scalar.body],
+    //                    [vector.recur.extract.for.phi, middle.block]
+    //
+    // Now update VPIRInstructions modeling LCSSA phis in the exit block.
+    // Extract the penultimate value of the recurrence and use it as operand for
+    // the VPIRInstruction modeling the phi.
+    for (VPUser *U : FOR->users()) {
+      using namespace llvm::VPlanPatternMatch;
+      if (!match(U, m_ExtractLastElement(m_Specific(FOR))))
+        continue;
+      // For VF vscale x 1, if vscale = 1, we are unable to extract the
+      // penultimate value of the recurrence. Instead we rely on the existing
+      // extract of the last element from the result of
+      // VPInstruction::FirstOrderRecurrenceSplice.
+      // TODO: Consider vscale_range info and UF.
+      if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne,
+                                                             Range))
+        return;
+      VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
+          VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
+          {}, "vector.recur.extract.for.phi");
+      cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement);
+    }
+  }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 2f00e51..5a8a2bb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -363,6 +363,19 @@ struct VPlanTransforms {
 
   static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF,
                                                 std::optional<unsigned> VScaleForTuning);
+
+  /// Create resume phis in the scalar preheader for first-order recurrences,
+  /// reductions and inductions, and update the VPIRInstructions wrapping the
+  /// original phis in the scalar header. End values for inductions are added to
+  /// \p IVEndValues.
+  static void addScalarResumePhis(VPlan &Plan, VPRecipeBuilder &Builder,
+                                  DenseMap<VPValue *, VPValue *> &IVEndValues);
+
+  /// Handle users of first-order recurrences in the original exit block. The
+  /// penultimate value of a recurrence is fed to its LCSSA phi users in the
+  /// original exit block via the VPIRInstruction wrapping the corresponding
+  /// LCSSA phi.
+  static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range);
 };
 
 } // namespace llvm
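
Two worked examples for the changes above.

The SimplifyCFG change is easiest to see on a wrapped case set. For an i8 switch on X where cases {0, 1, 254, 255} branch to D1 and every other value goes to D2, the D1 cases are contiguous only modulo 256. findContiguousCases finds the single gap (between 254 and 1), so the complement [2, 253] is a contiguous range for D2, and the switch lowers to a single check: (X - 2) u< 252 ? D2 : D1.

The SCCPSolver change implements the TODO it deletes: first try to refine the comparison to false (intersect the comparison's true-range with the known range of X), which can collapse a range check to an equality test, before falling back to relaxing it to true. Below is a minimal standalone sketch of that narrowing step; ConstantRange, exactIntersectWith, and getEquivalentICmp are the real LLVM APIs the patch uses, while the demoNarrowRangeCheck driver and the hand-picked ranges are illustrative assumptions, not part of the patch.

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/InstrTypes.h"
#include <optional>
using namespace llvm;

void demoNarrowRangeCheck() {
  // Suppose X is known to lie in [0, 4).
  ConstantRange LRange(APInt(8, 0), APInt(8, 4));
  // The check (X - 3) u< 2 holds exactly for X in [3, 5).
  ConstantRange CR(APInt(8, 3), APInt(8, 5));

  // Refine to false for impossible inputs: intersect with the known range.
  // [3, 5) intersected with [0, 4) is [3, 4), i.e. the single value 3.
  if (std::optional<ConstantRange> NewCR = CR.exactIntersectWith(LRange)) {
    CmpInst::Predicate Pred;
    APInt RHS;
    // A single-element range has an equivalent equality predicate, so this
    // sets Pred = ICMP_EQ and RHS = 3: the range check becomes X == 3.
    if (NewCR->getEquivalentICmp(Pred, RHS)) {
      // ... build the new icmp with IRBuilder<NoFolder>, as the patch does.
    }
  }
}

This mirrors the pair of ConvertCRToICmp calls in the patch: only when the exact intersection (refine-to-false) has no icmp equivalent does the code fall back to CR->exactUnionWith(LRange.inverse()), the original refine-to-true relaxation.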