diff options
author | Julian Nagele <j.nagele@apple.com> | 2024-11-17 19:35:32 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-11-17 19:35:32 +0000 |
commit | a8538b9138574142b9338ad0fce0f8ba1065fcbc (patch) | |
tree | ea1d353b00a1fb006fc9bcaa72e7ca7ee0372be0 /llvm/lib | |
parent | feb9b3701bf6650f91e12e7f4efbe72383f3f60b (diff) | |
download | llvm-a8538b9138574142b9338ad0fce0f8ba1065fcbc.zip llvm-a8538b9138574142b9338ad0fce0f8ba1065fcbc.tar.gz llvm-a8538b9138574142b9338ad0fce0f8ba1065fcbc.tar.bz2 |
[LV] Vectorize Epilogues for loops with small VF but high IC (#108190)
- Consider MainLoopVF * IC when determining whether Epilogue
Vectorization is profitable
- Allow the same VF for the Epilogue as for the main loop
- Use an upper bound for the trip count of the Epilogue when choosing
the Epilogue VF
PR: https://github.com/llvm/llvm-project/pull/108190
---------
Co-authored-by: Florian Hahn <flo@fhahn.com>
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h | 6 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 48 |
2 files changed, 41 insertions, 13 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 7787f58..a6b5235 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -525,6 +525,12 @@ private: bool isMoreProfitable(const VectorizationFactor &A, const VectorizationFactor &B) const; + /// Returns true if the per-lane cost of VectorizationFactor A is lower than + /// that of B in the context of vectorizing a loop with known \p MaxTripCount. + bool isMoreProfitable(const VectorizationFactor &A, + const VectorizationFactor &B, + const unsigned MaxTripCount) const; + /// Determines if we have the infrastructure to vectorize the loop and its /// epilogue, assuming the main loop is vectorized by \p VF. bool isCandidateForEpilogueVectorization(const ElementCount VF) const; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index aa90003..1d9e4f5 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1516,7 +1516,10 @@ public: /// Returns true if epilogue vectorization is considered profitable, and /// false otherwise. /// \p VF is the vectorization factor chosen for the original loop. - bool isEpilogueVectorizationProfitable(const ElementCount VF) const; + /// \p Multiplier is an aditional scaling factor applied to VF before + /// comparing to EpilogueVectorizationMinVF. + bool isEpilogueVectorizationProfitable(const ElementCount VF, + const unsigned Multiplier) const; /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. @@ -4289,12 +4292,11 @@ getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) { } bool LoopVectorizationPlanner::isMoreProfitable( - const VectorizationFactor &A, const VectorizationFactor &B) const { + const VectorizationFactor &A, const VectorizationFactor &B, + const unsigned MaxTripCount) const { InstructionCost CostA = A.Cost; InstructionCost CostB = B.Cost; - unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount(); - // Improve estimate for the vector width if it is scalable. unsigned EstimatedWidthA = A.Width.getKnownMinValue(); unsigned EstimatedWidthB = B.Width.getKnownMinValue(); @@ -4343,6 +4345,12 @@ bool LoopVectorizationPlanner::isMoreProfitable( return CmpFn(RTCostA, RTCostB); } +bool LoopVectorizationPlanner::isMoreProfitable( + const VectorizationFactor &A, const VectorizationFactor &B) const { + const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount(); + return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount); +} + void LoopVectorizationPlanner::emitInvalidCostRemarks( OptimizationRemarkEmitter *ORE) { using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>; @@ -4661,7 +4669,7 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( } bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( - const ElementCount VF) const { + const ElementCount VF, const unsigned Multiplier) const { // FIXME: We need a much better cost-model to take different parameters such // as register pressure, code size increase and cost of extra branches into // account. For now we apply a very crude heuristic and only consider loops @@ -4676,9 +4684,6 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( if (TTI.getMaxInterleaveFactor(VF) <= 1) return false; - unsigned Multiplier = 1; - if (VF.isScalable()) - Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1); if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF) return true; return false; @@ -4724,7 +4729,11 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( return Result; } - if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) { + unsigned Multiplier = IC; + if (MainLoopVF.isScalable()) + Multiplier = getVScaleForTuning(OrigLoop, TTI).value_or(1); + + if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, Multiplier)) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " "this loop\n"); return Result; @@ -4743,16 +4752,20 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( ScalarEvolution &SE = *PSE.getSE(); Type *TCType = Legal->getWidestInductionType(); const SCEV *RemainingIterations = nullptr; + unsigned MaxTripCount = 0; for (auto &NextVF : ProfitableVFs) { // Skip candidate VFs without a corresponding VPlan. if (!hasPlanWithVF(NextVF.Width)) continue; - // Skip candidate VFs with widths >= the estimate runtime VF (scalable - // vectors) or the VF of the main loop (fixed vectors). + // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable + // vectors) or > the VF of the main loop (fixed vectors). if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) || - ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) + (NextVF.Width.isScalable() && + ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) || + (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() && + ElementCount::isKnownGT(NextVF.Width, MainLoopVF))) continue; // If NextVF is greater than the number of remaining iterations, the @@ -4766,6 +4779,14 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( "Trip count SCEV must be computable"); RemainingIterations = SE.getURemExpr( TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC)); + MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1; + if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations, + SE.getConstant(TCType, MaxTripCount))) { + MaxTripCount = + SE.getUnsignedRangeMax(RemainingIterations).getZExtValue(); + } + LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: " + << MaxTripCount << "\n"); } if (SE.isKnownPredicate( CmpInst::ICMP_UGT, @@ -4774,7 +4795,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( continue; } - if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) + if (Result.Width.isScalar() || + isMoreProfitable(NextVF, Result, MaxTripCount)) Result = NextVF; } |