diff options
author | David Sherwood <david.sherwood@arm.com> | 2023-06-06 08:06:37 +0000 |
---|---|---|
committer | David Sherwood <david.sherwood@arm.com> | 2023-08-24 12:14:02 +0000 |
commit | c02184f286d13160460f0b2e38e4fd2a3edff7f0 (patch) | |
tree | f6f20ec9d0e6b3f39aeb0a5c4444b4d458931285 /llvm/lib/Transforms/Utils/LoopUtils.cpp | |
parent | d86a7d631c32341fd86fa5ecd247957cdb2c58d1 (diff) | |
download | llvm-c02184f286d13160460f0b2e38e4fd2a3edff7f0.zip llvm-c02184f286d13160460f0b2e38e4fd2a3edff7f0.tar.gz llvm-c02184f286d13160460f0b2e38e4fd2a3edff7f0.tar.bz2 |
[LoopVectorize] Allow inner loop runtime checks to be hoisted above an outer loop
Suppose we have a nested loop like this:
void foo(int32_t *dst, int32_t *src, int m, int n) {
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
dst[(i * n) + j] += src[(i * n) + j];
}
}
}
We currently generate runtime memory checks as a precondition for
entering the vectorised version of the inner loop. However, if the
runtime-determined trip count for the inner loop is quite small
then the cost of these checks becomes quite expensive. This patch
attempts to mitigate these costs by adding a new option to
expand the memory ranges being checked to include the outer loop
as well. This leads to runtime checks that can then be hoisted
above the outer loop. For example, rather than looking for a
conflict between the memory ranges:
1. &dst[(i * n)] -> &dst[(i * n) + n]
2. &src[(i * n)] -> &src[(i * n) + n]
we can instead look at the expanded ranges:
1. &dst[0] -> &dst[((m - 1) * n) + n]
2. &src[0] -> &src[((m - 1) * n) + n]
which are outer-loop-invariant. As with many optimisations there
is a trade-off here, because there is a danger that using the
expanded ranges we may never enter the vectorised inner loop,
whereas with the smaller ranges we might enter at least once.
I have added a HoistRuntimeChecks option that is turned off by
default, but can be enabled for workloads where we know this is
guaranteed to be of real benefit. In future, we can also use
PGO to determine if this is worthwhile by using the inner loop
trip count information.
When enabling this option for SPEC2017 on neoverse-v1 with the
flags "-Ofast -mcpu=native -flto" I see an overall geomean
improvement of ~0.5%:
SPEC2017 results (+ is an improvement, - is a regression):
520.omnetpp: +2%
525.x264: +2%
557.xz: +1.2%
...
GEOMEAN: +0.5%
I didn't investigate all the differences to see if they are
genuine or noise, but I know the x264 improvement is real because
it has some hot nested loops with low trip counts where I can
see this hoisting is beneficial.
Tests have been added here:
Transforms/LoopVectorize/runtime-checks-hoist.ll
Differential Revision: https://reviews.llvm.org/D152366
Diffstat (limited to 'llvm/lib/Transforms/Utils/LoopUtils.cpp')
-rw-r--r-- | llvm/lib/Transforms/Utils/LoopUtils.cpp | 83 |
1 files changed, 73 insertions, 10 deletions
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 6e0c195..1b2b371 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1628,42 +1628,92 @@ Loop *llvm::cloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM, struct PointerBounds { TrackingVH<Value> Start; TrackingVH<Value> End; + Value *StrideToCheck; }; /// Expand code for the lower and upper bound of the pointer group \p CG /// in \p TheLoop. \return the values for the bounds. static PointerBounds expandBounds(const RuntimeCheckingPtrGroup *CG, Loop *TheLoop, Instruction *Loc, - SCEVExpander &Exp) { + SCEVExpander &Exp, bool HoistRuntimeChecks) { LLVMContext &Ctx = Loc->getContext(); Type *PtrArithTy = PointerType::get(Ctx, CG->AddressSpace); Value *Start = nullptr, *End = nullptr; LLVM_DEBUG(dbgs() << "LAA: Adding RT check for range:\n"); - Start = Exp.expandCodeFor(CG->Low, PtrArithTy, Loc); - End = Exp.expandCodeFor(CG->High, PtrArithTy, Loc); + const SCEV *Low = CG->Low, *High = CG->High, *Stride = nullptr; + + // If the Low and High values are themselves loop-variant, then we may want + // to expand the range to include those covered by the outer loop as well. + // There is a trade-off here with the advantage being that creating checks + // using the expanded range permits the runtime memory checks to be hoisted + // out of the outer loop. This reduces the cost of entering the inner loop, + // which can be significant for low trip counts. The disadvantage is that + // there is a chance we may now never enter the vectorized inner loop, + // whereas using a restricted range check could have allowed us to enter at + // least once. This is why the behaviour is not currently the default and is + // controlled by the parameter 'HoistRuntimeChecks'. + if (HoistRuntimeChecks && TheLoop->getParentLoop() && + isa<SCEVAddRecExpr>(High) && isa<SCEVAddRecExpr>(Low)) { + auto *HighAR = cast<SCEVAddRecExpr>(High); + auto *LowAR = cast<SCEVAddRecExpr>(Low); + const Loop *OuterLoop = TheLoop->getParentLoop(); + const SCEV *Recur = LowAR->getStepRecurrence(*Exp.getSE()); + if (Recur == HighAR->getStepRecurrence(*Exp.getSE()) && + HighAR->getLoop() == OuterLoop && LowAR->getLoop() == OuterLoop) { + BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); + const SCEV *OuterExitCount = + Exp.getSE()->getExitCount(OuterLoop, OuterLoopLatch); + if (!isa<SCEVCouldNotCompute>(OuterExitCount) && + OuterExitCount->getType()->isIntegerTy()) { + const SCEV *NewHigh = cast<SCEVAddRecExpr>(High)->evaluateAtIteration( + OuterExitCount, *Exp.getSE()); + if (!isa<SCEVCouldNotCompute>(NewHigh)) { + LLVM_DEBUG(dbgs() << "LAA: Expanded RT check for range to include " + "outer loop in order to permit hoisting\n"); + High = NewHigh; + Low = cast<SCEVAddRecExpr>(Low)->getStart(); + // If there is a possibility that the stride is negative then we have + // to generate extra checks to ensure the stride is positive. + if (!Exp.getSE()->isKnownNonNegative(Recur)) { + Stride = Recur; + LLVM_DEBUG(dbgs() << "LAA: ... but need to check stride is " + "positive: " + << *Stride << '\n'); + } + } + } + } + } + + Start = Exp.expandCodeFor(Low, PtrArithTy, Loc); + End = Exp.expandCodeFor(High, PtrArithTy, Loc); if (CG->NeedsFreeze) { IRBuilder<> Builder(Loc); Start = Builder.CreateFreeze(Start, Start->getName() + ".fr"); End = Builder.CreateFreeze(End, End->getName() + ".fr"); } - LLVM_DEBUG(dbgs() << "Start: " << *CG->Low << " End: " << *CG->High << "\n"); - return {Start, End}; + Value *StrideVal = + Stride ? Exp.expandCodeFor(Stride, Type::getInt64Ty(Ctx), Loc) : nullptr; + LLVM_DEBUG(dbgs() << "Start: " << *Low << " End: " << *High << "\n"); + return {Start, End, StrideVal}; } /// Turns a collection of checks into a collection of expanded upper and /// lower bounds for both pointers in the check. static SmallVector<std::pair<PointerBounds, PointerBounds>, 4> expandBounds(const SmallVectorImpl<RuntimePointerCheck> &PointerChecks, Loop *L, - Instruction *Loc, SCEVExpander &Exp) { + Instruction *Loc, SCEVExpander &Exp, bool HoistRuntimeChecks) { SmallVector<std::pair<PointerBounds, PointerBounds>, 4> ChecksWithBounds; // Here we're relying on the SCEV Expander's cache to only emit code for the // same bounds once. transform(PointerChecks, std::back_inserter(ChecksWithBounds), [&](const RuntimePointerCheck &Check) { - PointerBounds First = expandBounds(Check.first, L, Loc, Exp), - Second = expandBounds(Check.second, L, Loc, Exp); + PointerBounds First = expandBounds(Check.first, L, Loc, Exp, + HoistRuntimeChecks), + Second = expandBounds(Check.second, L, Loc, Exp, + HoistRuntimeChecks); return std::make_pair(First, Second); }); @@ -1673,10 +1723,11 @@ expandBounds(const SmallVectorImpl<RuntimePointerCheck> &PointerChecks, Loop *L, Value *llvm::addRuntimeChecks( Instruction *Loc, Loop *TheLoop, const SmallVectorImpl<RuntimePointerCheck> &PointerChecks, - SCEVExpander &Exp) { + SCEVExpander &Exp, bool HoistRuntimeChecks) { // TODO: Move noalias annotation code from LoopVersioning here and share with LV if possible. // TODO: Pass RtPtrChecking instead of PointerChecks and SE separately, if possible - auto ExpandedChecks = expandBounds(PointerChecks, TheLoop, Loc, Exp); + auto ExpandedChecks = + expandBounds(PointerChecks, TheLoop, Loc, Exp, HoistRuntimeChecks); LLVMContext &Ctx = Loc->getContext(); IRBuilder<InstSimplifyFolder> ChkBuilder(Ctx, @@ -1707,6 +1758,18 @@ Value *llvm::addRuntimeChecks( Value *Cmp0 = ChkBuilder.CreateICmpULT(A.Start, B.End, "bound0"); Value *Cmp1 = ChkBuilder.CreateICmpULT(B.Start, A.End, "bound1"); Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict"); + if (A.StrideToCheck) { + Value *IsNegativeStride = ChkBuilder.CreateICmpSLT( + A.StrideToCheck, ConstantInt::get(A.StrideToCheck->getType(), 0), + "stride.check"); + IsConflict = ChkBuilder.CreateOr(IsConflict, IsNegativeStride); + } + if (B.StrideToCheck) { + Value *IsNegativeStride = ChkBuilder.CreateICmpSLT( + B.StrideToCheck, ConstantInt::get(B.StrideToCheck->getType(), 0), + "stride.check"); + IsConflict = ChkBuilder.CreateOr(IsConflict, IsNegativeStride); + } if (MemoryRuntimeCheck) { IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx"); |