diff options
author | Florian Hahn <flo@fhahn.com> | 2025-05-15 19:15:48 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-05-15 19:15:48 +0100 |
commit | bb10c3ba7f77d40a7fbfd4ac815015d3a4ae476a (patch) | |
tree | b77b8131e7315932f4986de6fa17358675232cf8 /llvm/lib/Transforms/Utils/LoopPeel.cpp | |
parent | 1acac5cd38210131c543e4635fcbfd4d597e15f5 (diff) | |
download | llvm-bb10c3ba7f77d40a7fbfd4ac815015d3a4ae476a.zip llvm-bb10c3ba7f77d40a7fbfd4ac815015d3a4ae476a.tar.gz llvm-bb10c3ba7f77d40a7fbfd4ac815015d3a4ae476a.tar.bz2 |
[LoopPeel] Implement initial peeling off the last loop iteration. (#139551)
Generalize countToEliminateCompares to also consider peeling off the
last iteration if it eliminates a compare.
At the moment, codegen for peeling off the last iteration is quite
restrictive and callers have to make sure that the exit condition can be
adjusted when peeling and that the loop executes at least 2 iterations.
Both will be relaxed in follow-ups.
PR: https://github.com/llvm/llvm-project/pull/139551
Diffstat (limited to 'llvm/lib/Transforms/Utils/LoopPeel.cpp')
-rw-r--r-- | llvm/lib/Transforms/Utils/LoopPeel.cpp | 377 |
1 files changed, 271 insertions, 106 deletions
diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp index f6ace9c..f15252b 100644 --- a/llvm/lib/Transforms/Utils/LoopPeel.cpp +++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp @@ -49,6 +49,7 @@ using namespace llvm::PatternMatch; #define DEBUG_TYPE "loop-peel" STATISTIC(NumPeeled, "Number of loops peeled"); +STATISTIC(NumPeeledEnd, "Number of loops peeled from end"); static cl::opt<unsigned> UnrollPeelCount( "unroll-peel-count", cl::Hidden, @@ -325,19 +326,71 @@ static unsigned peelToTurnInvariantLoadsDerefencebale(Loop &L, return 0; } -// Return the number of iterations to peel off that make conditions in the -// body true/false. For example, if we peel 2 iterations off the loop below, -// the condition i < 2 can be evaluated at compile time. +bool llvm::canPeelLastIteration(const Loop &L, ScalarEvolution &SE) { + const SCEV *BTC = SE.getBackedgeTakenCount(&L); + Value *Inc; + CmpPredicate Pred; + BasicBlock *Succ1; + BasicBlock *Succ2; + // The loop must execute at least 2 iterations to guarantee that peeled + // iteration executes. + // TODO: Add checks during codegen. + if (isa<SCEVCouldNotCompute>(BTC) || + !SE.isKnownPredicate(CmpInst::ICMP_UGT, BTC, SE.getZero(BTC->getType()))) + return false; + + // Check if the exit condition of the loop can be adjusted by the peeling + // codegen. For now, it must + // * exit via the latch, + // * the exit condition must be a NE/EQ compare of an induction with step + // of 1. + BasicBlock *Latch = L.getLoopLatch(); + return Latch && Latch == L.getExitingBlock() && + match(Latch->getTerminator(), + m_Br(m_ICmp(Pred, m_Value(Inc), m_Value()), m_BasicBlock(Succ1), + m_BasicBlock(Succ2))) && + ((Pred == CmpInst::ICMP_EQ && Succ2 == L.getHeader()) || + (Pred == CmpInst::ICMP_NE && Succ1 == L.getHeader())) && + isa<SCEVAddRecExpr>(SE.getSCEV(Inc)) && + cast<SCEVAddRecExpr>(SE.getSCEV(Inc))->getStepRecurrence(SE)->isOne(); +} + +/// Returns true if the last iteration can be peeled off and the condition (Pred +/// LeftAR, RightSCEV) is known at the last iteration and the inverse condition +/// is known at the second-to-last. +static bool shouldPeelLastIteration(Loop &L, CmpPredicate Pred, + const SCEVAddRecExpr *LeftAR, + const SCEV *RightSCEV, + ScalarEvolution &SE) { + if (!canPeelLastIteration(L, SE)) + return false; + + const SCEV *BTC = SE.getBackedgeTakenCount(&L); + const SCEV *ValAtLastIter = LeftAR->evaluateAtIteration(BTC, SE); + const SCEV *ValAtSecondToLastIter = LeftAR->evaluateAtIteration( + SE.getMinusSCEV(BTC, SE.getOne(BTC->getType())), SE); + + return SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), ValAtLastIter, + RightSCEV) && + SE.isKnownPredicate(Pred, ValAtSecondToLastIter, RightSCEV); +} + +// Return the number of iterations to peel off from the beginning and end of the +// loop respectively, that make conditions in the body true/false. For example, +// if we peel 2 iterations off the loop below, the condition i < 2 can be +// evaluated at compile time. +// // for (i = 0; i < n; i++) // if (i < 2) // .. // else // .. // } -static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount, - ScalarEvolution &SE) { +static std::pair<unsigned, unsigned> +countToEliminateCompares(Loop &L, unsigned MaxPeelCount, ScalarEvolution &SE) { assert(L.isLoopSimplifyForm() && "Loop needs to be in loop simplify form"); unsigned DesiredPeelCount = 0; + unsigned DesiredPeelCountLast = 0; // Do not peel the entire loop. const SCEV *BE = SE.getConstantMaxBackedgeTakenCount(&L); @@ -421,8 +474,11 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount, const SCEV *Step = LeftAR->getStepRecurrence(SE); if (!PeelWhilePredicateIsKnown(NewPeelCount, IterVal, RightSCEV, Step, - Pred)) + Pred)) { + if (shouldPeelLastIteration(L, Pred, LeftAR, RightSCEV, SE)) + DesiredPeelCountLast = 1; return; + } // However, for equality comparisons, that isn't always sufficient to // eliminate the comparsion in loop body, we may need to peel one more @@ -439,6 +495,7 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount, } DesiredPeelCount = std::max(DesiredPeelCount, NewPeelCount); + DesiredPeelCountLast = std::max(DesiredPeelCountLast, NewPeelCount); }; auto ComputePeelCountMinMax = [&](MinMaxIntrinsic *MinMax) { @@ -500,7 +557,7 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount, ComputePeelCount(BI->getCondition(), 0); } - return DesiredPeelCount; + return {DesiredPeelCount, DesiredPeelCountLast}; } /// This "heuristic" exactly matches implicit behavior which used to exist @@ -593,8 +650,9 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, DesiredPeelCount = std::max(DesiredPeelCount, *NumPeels); } - DesiredPeelCount = std::max(DesiredPeelCount, - countToEliminateCompares(*L, MaxPeelCount, SE)); + const auto &[CountToEliminateCmps, CountToEliminateCmpsLast] = + countToEliminateCompares(*L, MaxPeelCount, SE); + DesiredPeelCount = std::max(DesiredPeelCount, CountToEliminateCmps); if (DesiredPeelCount == 0) DesiredPeelCount = peelToTurnInvariantLoadsDerefencebale(*L, DT, AC); @@ -609,6 +667,23 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, << " some Phis into invariants.\n"); PP.PeelCount = DesiredPeelCount; PP.PeelProfiledIterations = false; + PP.PeelLast = false; + return; + } + } + + if (CountToEliminateCmpsLast > 0) { + unsigned DesiredPeelCountLast = + std::min(CountToEliminateCmpsLast, MaxPeelCount); + // Consider max peel count limitation. + assert(DesiredPeelCountLast > 0 && "Wrong loop size estimation?"); + if (DesiredPeelCountLast + AlreadyPeeled <= UnrollPeelMaxCount) { + LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount + << " iteration(s) to turn" + << " some Phis into invariants.\n"); + PP.PeelCount = DesiredPeelCountLast; + PP.PeelProfiledIterations = false; + PP.PeelLast = true; return; } } @@ -733,6 +808,7 @@ static void initBranchWeights(DenseMap<Instruction *, WeightInfo> &WeightInfos, /// InsertBot. /// \param IterNumber The serial number of the iteration currently being /// peeled off. +/// \param PeelLast Peel off the last iterations from \p L. /// \param ExitEdges The exit edges of the original loop. /// \param[out] NewBlocks A list of the blocks in the newly created clone /// \param[out] VMap The value map between the loop and the new clone. @@ -740,7 +816,8 @@ static void initBranchWeights(DenseMap<Instruction *, WeightInfo> &WeightInfos, /// \param LVMap A value-map that maps instructions from the original loop to /// instructions in the last peeled-off iteration. static void cloneLoopBlocks( - Loop *L, unsigned IterNumber, BasicBlock *InsertTop, BasicBlock *InsertBot, + Loop *L, unsigned IterNumber, bool PeelLast, BasicBlock *InsertTop, + BasicBlock *InsertBot, SmallVectorImpl<std::pair<BasicBlock *, BasicBlock *>> &ExitEdges, SmallVectorImpl<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap, ValueToValueMapTy &LVMap, DominatorTree *DT, @@ -804,16 +881,26 @@ static void cloneLoopBlocks( // Similarly, for the latch: // The original exiting edge is still hooked up to the loop exit. - // The backedge now goes to the "bottom", which is either the loop's real - // header (for the last peeled iteration) or the copied header of the next - // iteration (for every other iteration) BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]); - auto *LatchTerm = cast<Instruction>(NewLatch->getTerminator()); - for (unsigned idx = 0, e = LatchTerm->getNumSuccessors(); idx < e; ++idx) - if (LatchTerm->getSuccessor(idx) == Header) { - LatchTerm->setSuccessor(idx, InsertBot); - break; + if (PeelLast) { + // This is the last iteration and we definitely will go to the exit. Just + // set both successors to InsertBot and let the branch be simplified later. + assert(IterNumber == 0 && "Only peeling a single iteration implemented."); + auto *LatchTerm = cast<BranchInst>(NewLatch->getTerminator()); + LatchTerm->setSuccessor(0, InsertBot); + LatchTerm->setSuccessor(1, InsertBot); + } else { + auto *LatchTerm = cast<Instruction>(NewLatch->getTerminator()); + // The backedge now goes to the "bottom", which is either the loop's real + // header (for the last peeled iteration) or the copied header of the next + // iteration (for every other iteration) + for (unsigned idx = 0, e = LatchTerm->getNumSuccessors(); idx < e; ++idx) { + if (LatchTerm->getSuccessor(idx) == Header) { + LatchTerm->setSuccessor(idx, InsertBot); + break; + } } + } if (DT) DT->changeImmediateDominator(InsertBot, NewLatch); @@ -821,23 +908,33 @@ static void cloneLoopBlocks( // that pick an incoming value from either the preheader, or the previous // loop iteration. Since this copy is no longer part of the loop, we // resolve this statically: - // For the first iteration, we use the value from the preheader directly. - // For any other iteration, we replace the phi with the value generated by - // the immediately preceding clone of the loop body (which represents - // the previous iteration). - for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { - PHINode *NewPHI = cast<PHINode>(VMap[&*I]); - if (IterNumber == 0) { - VMap[&*I] = NewPHI->getIncomingValueForBlock(PreHeader); - } else { - Value *LatchVal = NewPHI->getIncomingValueForBlock(Latch); - Instruction *LatchInst = dyn_cast<Instruction>(LatchVal); - if (LatchInst && L->contains(LatchInst)) - VMap[&*I] = LVMap[LatchInst]; - else - VMap[&*I] = LatchVal; + if (PeelLast) { + // For the last iteration, we use the value from the latch of the original + // loop directly. + for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { + PHINode *NewPHI = cast<PHINode>(VMap[&*I]); + VMap[&*I] = NewPHI->getIncomingValueForBlock(Latch); + NewPHI->eraseFromParent(); + } + } else { + // For the first iteration, we use the value from the preheader directly. + // For any other iteration, we replace the phi with the value generated by + // the immediately preceding clone of the loop body (which represents + // the previous iteration). + for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { + PHINode *NewPHI = cast<PHINode>(VMap[&*I]); + if (IterNumber == 0) { + VMap[&*I] = NewPHI->getIncomingValueForBlock(PreHeader); + } else { + Value *LatchVal = NewPHI->getIncomingValueForBlock(Latch); + Instruction *LatchInst = dyn_cast<Instruction>(LatchVal); + if (LatchInst && L->contains(LatchInst)) + VMap[&*I] = LVMap[LatchInst]; + else + VMap[&*I] = LatchVal; + } + NewPHI->eraseFromParent(); } - NewPHI->eraseFromParent(); } // Fix up the outgoing values - we need to add a value for the iteration @@ -905,11 +1002,14 @@ llvm::gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, /// this provides a benefit, since the peeled off iterations, which account /// for the bulk of dynamic execution, can be further simplified by scalar /// optimizations. -bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, +bool llvm::peelLoop(Loop *L, unsigned PeelCount, bool PeelLast, LoopInfo *LI, ScalarEvolution *SE, DominatorTree &DT, AssumptionCache *AC, bool PreserveLCSSA, ValueToValueMapTy &LVMap) { assert(PeelCount > 0 && "Attempt to peel out zero iterations?"); assert(canPeel(L) && "Attempt to peel a loop which is not peelable?"); + assert((!PeelLast || (canPeelLastIteration(*L, *SE) && PeelCount == 1)) && + "when peeling the last iteration, the loop must be supported and can " + "only peel a single iteration"); LoopBlocksDFS LoopBlocks(L); LoopBlocks.perform(LI); @@ -944,60 +1044,99 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, Function *F = Header->getParent(); - // Set up all the necessary basic blocks. It is convenient to split the - // preheader into 3 parts - two blocks to anchor the peeled copy of the loop - // body, and a new preheader for the "real" loop. - - // Peeling the first iteration transforms. - // - // PreHeader: - // ... - // Header: - // LoopBody - // If (cond) goto Header - // Exit: - // - // into - // - // InsertTop: - // LoopBody - // If (!cond) goto Exit - // InsertBot: - // NewPreHeader: - // ... - // Header: - // LoopBody - // If (cond) goto Header - // Exit: - // - // Each following iteration will split the current bottom anchor in two, - // and put the new copy of the loop body between these two blocks. That is, - // after peeling another iteration from the example above, we'll split - // InsertBot, and get: - // - // InsertTop: - // LoopBody - // If (!cond) goto Exit - // InsertBot: - // LoopBody - // If (!cond) goto Exit - // InsertBot.next: - // NewPreHeader: - // ... - // Header: - // LoopBody - // If (cond) goto Header - // Exit: - - BasicBlock *InsertTop = SplitEdge(PreHeader, Header, &DT, LI); - BasicBlock *InsertBot = - SplitBlock(InsertTop, InsertTop->getTerminator(), &DT, LI); - BasicBlock *NewPreHeader = - SplitBlock(InsertBot, InsertBot->getTerminator(), &DT, LI); - - InsertTop->setName(Header->getName() + ".peel.begin"); - InsertBot->setName(Header->getName() + ".peel.next"); - NewPreHeader->setName(PreHeader->getName() + ".peel.newph"); + // Set up all the necessary basic blocks. + BasicBlock *InsertTop; + BasicBlock *InsertBot; + BasicBlock *NewPreHeader; + DenseMap<Instruction *, Value *> ExitValues; + if (PeelLast) { + // It is convenient to split the single exit block from the latch the + // into 3 parts - two blocks to anchor the peeled copy of the loop body, + // and a new final exit block. + + // Peeling the last iteration transforms. + // + // PreHeader: + // ... + // Header: + // LoopBody + // If (cond) goto Header + // Exit: + // + // into + // + // Header: + // LoopBody + // If (cond) goto Header + // InsertTop: + // LoopBody + // If (!cond) goto InsertBot + // InsertBot: + // Exit: + // ... + BasicBlock *Exit = L->getExitBlock(); + for (PHINode &P : Exit->phis()) + ExitValues[&P] = P.getIncomingValueForBlock(Latch); + + InsertTop = SplitEdge(Latch, Exit, &DT, LI); + InsertBot = SplitBlock(InsertTop, InsertTop->getTerminator(), &DT, LI); + + InsertTop->setName(Exit->getName() + ".peel.begin"); + InsertBot->setName(Exit->getName() + ".peel.next"); + } else { + // It is convenient to split the preheader into 3 parts - two blocks to + // anchor the peeled copy of the loop body, and a new preheader for the + // "real" loop. + + // Peeling the first iteration transforms. + // + // PreHeader: + // ... + // Header: + // LoopBody + // If (cond) goto Header + // Exit: + // + // into + // + // InsertTop: + // LoopBody + // If (!cond) goto Exit + // InsertBot: + // NewPreHeader: + // ... + // Header: + // LoopBody + // If (cond) goto Header + // Exit: + // + // Each following iteration will split the current bottom anchor in two, + // and put the new copy of the loop body between these two blocks. That + // is, after peeling another iteration from the example above, we'll + // split InsertBot, and get: + // + // InsertTop: + // LoopBody + // If (!cond) goto Exit + // InsertBot: + // LoopBody + // If (!cond) goto Exit + // InsertBot.next: + // NewPreHeader: + // ... + // Header: + // LoopBody + // If (cond) goto Header + // Exit: + // + InsertTop = SplitEdge(PreHeader, Header, &DT, LI); + InsertBot = SplitBlock(InsertTop, InsertTop->getTerminator(), &DT, LI); + NewPreHeader = SplitBlock(InsertBot, InsertBot->getTerminator(), &DT, LI); + + InsertTop->setName(Header->getName() + ".peel.begin"); + InsertBot->setName(Header->getName() + ".peel.next"); + NewPreHeader->setName(PreHeader->getName() + ".peel.newph"); + } Instruction *LatchTerm = cast<Instruction>(cast<BasicBlock>(Latch)->getTerminator()); @@ -1013,23 +1152,40 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, identifyNoAliasScopesToClone(L->getBlocks(), LoopLocalNoAliasDeclScopes); // For each peeled-off iteration, make a copy of the loop. + ValueToValueMapTy VMap; for (unsigned Iter = 0; Iter < PeelCount; ++Iter) { SmallVector<BasicBlock *, 8> NewBlocks; - ValueToValueMapTy VMap; - cloneLoopBlocks(L, Iter, InsertTop, InsertBot, ExitEdges, NewBlocks, - LoopBlocks, VMap, LVMap, &DT, LI, + cloneLoopBlocks(L, Iter, PeelLast, InsertTop, InsertBot, ExitEdges, + NewBlocks, LoopBlocks, VMap, LVMap, &DT, LI, LoopLocalNoAliasDeclScopes, *SE); // Remap to use values from the current iteration instead of the // previous one. remapInstructionsInBlocks(NewBlocks, VMap); - // Update IDoms of the blocks reachable through exits. - if (Iter == 0) - for (auto BBIDom : NonLoopBlocksIDom) - DT.changeImmediateDominator(BBIDom.first, - cast<BasicBlock>(LVMap[BBIDom.second])); + if (Iter == 0) { + if (PeelLast) { + // Adjust the exit condition so the loop exits one iteration early. + // For now we simply subtract one form the second operand of the + // exit condition. This relies on the peel count computation to + // check that this is actually legal. In particular, it ensures that + // the first operand of the compare is an AddRec with step 1 and we + // execute more than one iteration. + auto *Cmp = + cast<ICmpInst>(L->getLoopLatch()->getTerminator()->getOperand(0)); + IRBuilder B(Cmp); + Cmp->setOperand( + 1, B.CreateSub(Cmp->getOperand(1), + ConstantInt::get(Cmp->getOperand(1)->getType(), 1))); + } else { + // Update IDoms of the blocks reachable through exits. + for (auto BBIDom : NonLoopBlocksIDom) + DT.changeImmediateDominator(BBIDom.first, + cast<BasicBlock>(LVMap[BBIDom.second])); + } + } + #ifdef EXPENSIVE_CHECKS assert(DT.verify(DominatorTree::VerificationLevel::Fast)); #endif @@ -1052,16 +1208,24 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, F->end()); } - // Now adjust the phi nodes in the loop header to get their initial values - // from the last peeled-off iteration instead of the preheader. - for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { - PHINode *PHI = cast<PHINode>(I); - Value *NewVal = PHI->getIncomingValueForBlock(Latch); - Instruction *LatchInst = dyn_cast<Instruction>(NewVal); - if (LatchInst && L->contains(LatchInst)) - NewVal = LVMap[LatchInst]; + if (PeelLast) { + // Now adjust users of the original exit values by replacing them with the + // exit value from the peeled iteration. + for (const auto &[P, E] : ExitValues) + P->replaceAllUsesWith(VMap.lookup(E)); + formLCSSA(*L, DT, LI, SE); + } else { + // Now adjust the phi nodes in the loop header to get their initial values + // from the last peeled-off iteration instead of the preheader. + for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { + PHINode *PHI = cast<PHINode>(I); + Value *NewVal = PHI->getIncomingValueForBlock(Latch); + Instruction *LatchInst = dyn_cast<Instruction>(NewVal); + if (LatchInst && L->contains(LatchInst)) + NewVal = LVMap[LatchInst]; - PHI->setIncomingValueForBlock(NewPreHeader, NewVal); + PHI->setIncomingValueForBlock(NewPreHeader, NewVal); + } } for (const auto &[Term, Info] : Weights) { @@ -1090,6 +1254,7 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, simplifyLoop(L, &DT, LI, SE, AC, nullptr, PreserveLCSSA); NumPeeled++; + NumPeeledEnd += PeelLast; return true; } |