diff options
Diffstat (limited to 'llvm/lib/Transforms/Utils/LoopPeel.cpp')
-rw-r--r-- | llvm/lib/Transforms/Utils/LoopPeel.cpp | 377 |
1 files changed, 271 insertions, 106 deletions
diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp index f6ace9c..f15252b 100644 --- a/llvm/lib/Transforms/Utils/LoopPeel.cpp +++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp @@ -49,6 +49,7 @@ using namespace llvm::PatternMatch; #define DEBUG_TYPE "loop-peel" STATISTIC(NumPeeled, "Number of loops peeled"); +STATISTIC(NumPeeledEnd, "Number of loops peeled from end"); static cl::opt<unsigned> UnrollPeelCount( "unroll-peel-count", cl::Hidden, @@ -325,19 +326,71 @@ static unsigned peelToTurnInvariantLoadsDerefencebale(Loop &L, return 0; } -// Return the number of iterations to peel off that make conditions in the -// body true/false. For example, if we peel 2 iterations off the loop below, -// the condition i < 2 can be evaluated at compile time. +bool llvm::canPeelLastIteration(const Loop &L, ScalarEvolution &SE) { + const SCEV *BTC = SE.getBackedgeTakenCount(&L); + Value *Inc; + CmpPredicate Pred; + BasicBlock *Succ1; + BasicBlock *Succ2; + // The loop must execute at least 2 iterations to guarantee that peeled + // iteration executes. + // TODO: Add checks during codegen. + if (isa<SCEVCouldNotCompute>(BTC) || + !SE.isKnownPredicate(CmpInst::ICMP_UGT, BTC, SE.getZero(BTC->getType()))) + return false; + + // Check if the exit condition of the loop can be adjusted by the peeling + // codegen. For now, it must + // * exit via the latch, + // * the exit condition must be a NE/EQ compare of an induction with step + // of 1. + BasicBlock *Latch = L.getLoopLatch(); + return Latch && Latch == L.getExitingBlock() && + match(Latch->getTerminator(), + m_Br(m_ICmp(Pred, m_Value(Inc), m_Value()), m_BasicBlock(Succ1), + m_BasicBlock(Succ2))) && + ((Pred == CmpInst::ICMP_EQ && Succ2 == L.getHeader()) || + (Pred == CmpInst::ICMP_NE && Succ1 == L.getHeader())) && + isa<SCEVAddRecExpr>(SE.getSCEV(Inc)) && + cast<SCEVAddRecExpr>(SE.getSCEV(Inc))->getStepRecurrence(SE)->isOne(); +} + +/// Returns true if the last iteration can be peeled off and the condition (Pred +/// LeftAR, RightSCEV) is known at the last iteration and the inverse condition +/// is known at the second-to-last. +static bool shouldPeelLastIteration(Loop &L, CmpPredicate Pred, + const SCEVAddRecExpr *LeftAR, + const SCEV *RightSCEV, + ScalarEvolution &SE) { + if (!canPeelLastIteration(L, SE)) + return false; + + const SCEV *BTC = SE.getBackedgeTakenCount(&L); + const SCEV *ValAtLastIter = LeftAR->evaluateAtIteration(BTC, SE); + const SCEV *ValAtSecondToLastIter = LeftAR->evaluateAtIteration( + SE.getMinusSCEV(BTC, SE.getOne(BTC->getType())), SE); + + return SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), ValAtLastIter, + RightSCEV) && + SE.isKnownPredicate(Pred, ValAtSecondToLastIter, RightSCEV); +} + +// Return the number of iterations to peel off from the beginning and end of the +// loop respectively, that make conditions in the body true/false. For example, +// if we peel 2 iterations off the loop below, the condition i < 2 can be +// evaluated at compile time. +// // for (i = 0; i < n; i++) // if (i < 2) // .. // else // .. // } -static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount, - ScalarEvolution &SE) { +static std::pair<unsigned, unsigned> +countToEliminateCompares(Loop &L, unsigned MaxPeelCount, ScalarEvolution &SE) { assert(L.isLoopSimplifyForm() && "Loop needs to be in loop simplify form"); unsigned DesiredPeelCount = 0; + unsigned DesiredPeelCountLast = 0; // Do not peel the entire loop. const SCEV *BE = SE.getConstantMaxBackedgeTakenCount(&L); @@ -421,8 +474,11 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount, const SCEV *Step = LeftAR->getStepRecurrence(SE); if (!PeelWhilePredicateIsKnown(NewPeelCount, IterVal, RightSCEV, Step, - Pred)) + Pred)) { + if (shouldPeelLastIteration(L, Pred, LeftAR, RightSCEV, SE)) + DesiredPeelCountLast = 1; return; + } // However, for equality comparisons, that isn't always sufficient to // eliminate the comparsion in loop body, we may need to peel one more @@ -439,6 +495,7 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount, } DesiredPeelCount = std::max(DesiredPeelCount, NewPeelCount); + DesiredPeelCountLast = std::max(DesiredPeelCountLast, NewPeelCount); }; auto ComputePeelCountMinMax = [&](MinMaxIntrinsic *MinMax) { @@ -500,7 +557,7 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount, ComputePeelCount(BI->getCondition(), 0); } - return DesiredPeelCount; + return {DesiredPeelCount, DesiredPeelCountLast}; } /// This "heuristic" exactly matches implicit behavior which used to exist @@ -593,8 +650,9 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, DesiredPeelCount = std::max(DesiredPeelCount, *NumPeels); } - DesiredPeelCount = std::max(DesiredPeelCount, - countToEliminateCompares(*L, MaxPeelCount, SE)); + const auto &[CountToEliminateCmps, CountToEliminateCmpsLast] = + countToEliminateCompares(*L, MaxPeelCount, SE); + DesiredPeelCount = std::max(DesiredPeelCount, CountToEliminateCmps); if (DesiredPeelCount == 0) DesiredPeelCount = peelToTurnInvariantLoadsDerefencebale(*L, DT, AC); @@ -609,6 +667,23 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, << " some Phis into invariants.\n"); PP.PeelCount = DesiredPeelCount; PP.PeelProfiledIterations = false; + PP.PeelLast = false; + return; + } + } + + if (CountToEliminateCmpsLast > 0) { + unsigned DesiredPeelCountLast = + std::min(CountToEliminateCmpsLast, MaxPeelCount); + // Consider max peel count limitation. + assert(DesiredPeelCountLast > 0 && "Wrong loop size estimation?"); + if (DesiredPeelCountLast + AlreadyPeeled <= UnrollPeelMaxCount) { + LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount + << " iteration(s) to turn" + << " some Phis into invariants.\n"); + PP.PeelCount = DesiredPeelCountLast; + PP.PeelProfiledIterations = false; + PP.PeelLast = true; return; } } @@ -733,6 +808,7 @@ static void initBranchWeights(DenseMap<Instruction *, WeightInfo> &WeightInfos, /// InsertBot. /// \param IterNumber The serial number of the iteration currently being /// peeled off. +/// \param PeelLast Peel off the last iterations from \p L. /// \param ExitEdges The exit edges of the original loop. /// \param[out] NewBlocks A list of the blocks in the newly created clone /// \param[out] VMap The value map between the loop and the new clone. @@ -740,7 +816,8 @@ static void initBranchWeights(DenseMap<Instruction *, WeightInfo> &WeightInfos, /// \param LVMap A value-map that maps instructions from the original loop to /// instructions in the last peeled-off iteration. static void cloneLoopBlocks( - Loop *L, unsigned IterNumber, BasicBlock *InsertTop, BasicBlock *InsertBot, + Loop *L, unsigned IterNumber, bool PeelLast, BasicBlock *InsertTop, + BasicBlock *InsertBot, SmallVectorImpl<std::pair<BasicBlock *, BasicBlock *>> &ExitEdges, SmallVectorImpl<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap, ValueToValueMapTy &LVMap, DominatorTree *DT, @@ -804,16 +881,26 @@ static void cloneLoopBlocks( // Similarly, for the latch: // The original exiting edge is still hooked up to the loop exit. - // The backedge now goes to the "bottom", which is either the loop's real - // header (for the last peeled iteration) or the copied header of the next - // iteration (for every other iteration) BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]); - auto *LatchTerm = cast<Instruction>(NewLatch->getTerminator()); - for (unsigned idx = 0, e = LatchTerm->getNumSuccessors(); idx < e; ++idx) - if (LatchTerm->getSuccessor(idx) == Header) { - LatchTerm->setSuccessor(idx, InsertBot); - break; + if (PeelLast) { + // This is the last iteration and we definitely will go to the exit. Just + // set both successors to InsertBot and let the branch be simplified later. + assert(IterNumber == 0 && "Only peeling a single iteration implemented."); + auto *LatchTerm = cast<BranchInst>(NewLatch->getTerminator()); + LatchTerm->setSuccessor(0, InsertBot); + LatchTerm->setSuccessor(1, InsertBot); + } else { + auto *LatchTerm = cast<Instruction>(NewLatch->getTerminator()); + // The backedge now goes to the "bottom", which is either the loop's real + // header (for the last peeled iteration) or the copied header of the next + // iteration (for every other iteration) + for (unsigned idx = 0, e = LatchTerm->getNumSuccessors(); idx < e; ++idx) { + if (LatchTerm->getSuccessor(idx) == Header) { + LatchTerm->setSuccessor(idx, InsertBot); + break; + } } + } if (DT) DT->changeImmediateDominator(InsertBot, NewLatch); @@ -821,23 +908,33 @@ static void cloneLoopBlocks( // that pick an incoming value from either the preheader, or the previous // loop iteration. Since this copy is no longer part of the loop, we // resolve this statically: - // For the first iteration, we use the value from the preheader directly. - // For any other iteration, we replace the phi with the value generated by - // the immediately preceding clone of the loop body (which represents - // the previous iteration). - for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { - PHINode *NewPHI = cast<PHINode>(VMap[&*I]); - if (IterNumber == 0) { - VMap[&*I] = NewPHI->getIncomingValueForBlock(PreHeader); - } else { - Value *LatchVal = NewPHI->getIncomingValueForBlock(Latch); - Instruction *LatchInst = dyn_cast<Instruction>(LatchVal); - if (LatchInst && L->contains(LatchInst)) - VMap[&*I] = LVMap[LatchInst]; - else - VMap[&*I] = LatchVal; + if (PeelLast) { + // For the last iteration, we use the value from the latch of the original + // loop directly. + for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { + PHINode *NewPHI = cast<PHINode>(VMap[&*I]); + VMap[&*I] = NewPHI->getIncomingValueForBlock(Latch); + NewPHI->eraseFromParent(); + } + } else { + // For the first iteration, we use the value from the preheader directly. + // For any other iteration, we replace the phi with the value generated by + // the immediately preceding clone of the loop body (which represents + // the previous iteration). + for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { + PHINode *NewPHI = cast<PHINode>(VMap[&*I]); + if (IterNumber == 0) { + VMap[&*I] = NewPHI->getIncomingValueForBlock(PreHeader); + } else { + Value *LatchVal = NewPHI->getIncomingValueForBlock(Latch); + Instruction *LatchInst = dyn_cast<Instruction>(LatchVal); + if (LatchInst && L->contains(LatchInst)) + VMap[&*I] = LVMap[LatchInst]; + else + VMap[&*I] = LatchVal; + } + NewPHI->eraseFromParent(); } - NewPHI->eraseFromParent(); } // Fix up the outgoing values - we need to add a value for the iteration @@ -905,11 +1002,14 @@ llvm::gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, /// this provides a benefit, since the peeled off iterations, which account /// for the bulk of dynamic execution, can be further simplified by scalar /// optimizations. -bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, +bool llvm::peelLoop(Loop *L, unsigned PeelCount, bool PeelLast, LoopInfo *LI, ScalarEvolution *SE, DominatorTree &DT, AssumptionCache *AC, bool PreserveLCSSA, ValueToValueMapTy &LVMap) { assert(PeelCount > 0 && "Attempt to peel out zero iterations?"); assert(canPeel(L) && "Attempt to peel a loop which is not peelable?"); + assert((!PeelLast || (canPeelLastIteration(*L, *SE) && PeelCount == 1)) && + "when peeling the last iteration, the loop must be supported and can " + "only peel a single iteration"); LoopBlocksDFS LoopBlocks(L); LoopBlocks.perform(LI); @@ -944,60 +1044,99 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, Function *F = Header->getParent(); - // Set up all the necessary basic blocks. It is convenient to split the - // preheader into 3 parts - two blocks to anchor the peeled copy of the loop - // body, and a new preheader for the "real" loop. - - // Peeling the first iteration transforms. - // - // PreHeader: - // ... - // Header: - // LoopBody - // If (cond) goto Header - // Exit: - // - // into - // - // InsertTop: - // LoopBody - // If (!cond) goto Exit - // InsertBot: - // NewPreHeader: - // ... - // Header: - // LoopBody - // If (cond) goto Header - // Exit: - // - // Each following iteration will split the current bottom anchor in two, - // and put the new copy of the loop body between these two blocks. That is, - // after peeling another iteration from the example above, we'll split - // InsertBot, and get: - // - // InsertTop: - // LoopBody - // If (!cond) goto Exit - // InsertBot: - // LoopBody - // If (!cond) goto Exit - // InsertBot.next: - // NewPreHeader: - // ... - // Header: - // LoopBody - // If (cond) goto Header - // Exit: - - BasicBlock *InsertTop = SplitEdge(PreHeader, Header, &DT, LI); - BasicBlock *InsertBot = - SplitBlock(InsertTop, InsertTop->getTerminator(), &DT, LI); - BasicBlock *NewPreHeader = - SplitBlock(InsertBot, InsertBot->getTerminator(), &DT, LI); - - InsertTop->setName(Header->getName() + ".peel.begin"); - InsertBot->setName(Header->getName() + ".peel.next"); - NewPreHeader->setName(PreHeader->getName() + ".peel.newph"); + // Set up all the necessary basic blocks. + BasicBlock *InsertTop; + BasicBlock *InsertBot; + BasicBlock *NewPreHeader; + DenseMap<Instruction *, Value *> ExitValues; + if (PeelLast) { + // It is convenient to split the single exit block from the latch the + // into 3 parts - two blocks to anchor the peeled copy of the loop body, + // and a new final exit block. + + // Peeling the last iteration transforms. + // + // PreHeader: + // ... + // Header: + // LoopBody + // If (cond) goto Header + // Exit: + // + // into + // + // Header: + // LoopBody + // If (cond) goto Header + // InsertTop: + // LoopBody + // If (!cond) goto InsertBot + // InsertBot: + // Exit: + // ... + BasicBlock *Exit = L->getExitBlock(); + for (PHINode &P : Exit->phis()) + ExitValues[&P] = P.getIncomingValueForBlock(Latch); + + InsertTop = SplitEdge(Latch, Exit, &DT, LI); + InsertBot = SplitBlock(InsertTop, InsertTop->getTerminator(), &DT, LI); + + InsertTop->setName(Exit->getName() + ".peel.begin"); + InsertBot->setName(Exit->getName() + ".peel.next"); + } else { + // It is convenient to split the preheader into 3 parts - two blocks to + // anchor the peeled copy of the loop body, and a new preheader for the + // "real" loop. + + // Peeling the first iteration transforms. + // + // PreHeader: + // ... + // Header: + // LoopBody + // If (cond) goto Header + // Exit: + // + // into + // + // InsertTop: + // LoopBody + // If (!cond) goto Exit + // InsertBot: + // NewPreHeader: + // ... + // Header: + // LoopBody + // If (cond) goto Header + // Exit: + // + // Each following iteration will split the current bottom anchor in two, + // and put the new copy of the loop body between these two blocks. That + // is, after peeling another iteration from the example above, we'll + // split InsertBot, and get: + // + // InsertTop: + // LoopBody + // If (!cond) goto Exit + // InsertBot: + // LoopBody + // If (!cond) goto Exit + // InsertBot.next: + // NewPreHeader: + // ... + // Header: + // LoopBody + // If (cond) goto Header + // Exit: + // + InsertTop = SplitEdge(PreHeader, Header, &DT, LI); + InsertBot = SplitBlock(InsertTop, InsertTop->getTerminator(), &DT, LI); + NewPreHeader = SplitBlock(InsertBot, InsertBot->getTerminator(), &DT, LI); + + InsertTop->setName(Header->getName() + ".peel.begin"); + InsertBot->setName(Header->getName() + ".peel.next"); + NewPreHeader->setName(PreHeader->getName() + ".peel.newph"); + } Instruction *LatchTerm = cast<Instruction>(cast<BasicBlock>(Latch)->getTerminator()); @@ -1013,23 +1152,40 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, identifyNoAliasScopesToClone(L->getBlocks(), LoopLocalNoAliasDeclScopes); // For each peeled-off iteration, make a copy of the loop. + ValueToValueMapTy VMap; for (unsigned Iter = 0; Iter < PeelCount; ++Iter) { SmallVector<BasicBlock *, 8> NewBlocks; - ValueToValueMapTy VMap; - cloneLoopBlocks(L, Iter, InsertTop, InsertBot, ExitEdges, NewBlocks, - LoopBlocks, VMap, LVMap, &DT, LI, + cloneLoopBlocks(L, Iter, PeelLast, InsertTop, InsertBot, ExitEdges, + NewBlocks, LoopBlocks, VMap, LVMap, &DT, LI, LoopLocalNoAliasDeclScopes, *SE); // Remap to use values from the current iteration instead of the // previous one. remapInstructionsInBlocks(NewBlocks, VMap); - // Update IDoms of the blocks reachable through exits. - if (Iter == 0) - for (auto BBIDom : NonLoopBlocksIDom) - DT.changeImmediateDominator(BBIDom.first, - cast<BasicBlock>(LVMap[BBIDom.second])); + if (Iter == 0) { + if (PeelLast) { + // Adjust the exit condition so the loop exits one iteration early. + // For now we simply subtract one form the second operand of the + // exit condition. This relies on the peel count computation to + // check that this is actually legal. In particular, it ensures that + // the first operand of the compare is an AddRec with step 1 and we + // execute more than one iteration. + auto *Cmp = + cast<ICmpInst>(L->getLoopLatch()->getTerminator()->getOperand(0)); + IRBuilder B(Cmp); + Cmp->setOperand( + 1, B.CreateSub(Cmp->getOperand(1), + ConstantInt::get(Cmp->getOperand(1)->getType(), 1))); + } else { + // Update IDoms of the blocks reachable through exits. + for (auto BBIDom : NonLoopBlocksIDom) + DT.changeImmediateDominator(BBIDom.first, + cast<BasicBlock>(LVMap[BBIDom.second])); + } + } + #ifdef EXPENSIVE_CHECKS assert(DT.verify(DominatorTree::VerificationLevel::Fast)); #endif @@ -1052,16 +1208,24 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, F->end()); } - // Now adjust the phi nodes in the loop header to get their initial values - // from the last peeled-off iteration instead of the preheader. - for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { - PHINode *PHI = cast<PHINode>(I); - Value *NewVal = PHI->getIncomingValueForBlock(Latch); - Instruction *LatchInst = dyn_cast<Instruction>(NewVal); - if (LatchInst && L->contains(LatchInst)) - NewVal = LVMap[LatchInst]; + if (PeelLast) { + // Now adjust users of the original exit values by replacing them with the + // exit value from the peeled iteration. + for (const auto &[P, E] : ExitValues) + P->replaceAllUsesWith(VMap.lookup(E)); + formLCSSA(*L, DT, LI, SE); + } else { + // Now adjust the phi nodes in the loop header to get their initial values + // from the last peeled-off iteration instead of the preheader. + for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { + PHINode *PHI = cast<PHINode>(I); + Value *NewVal = PHI->getIncomingValueForBlock(Latch); + Instruction *LatchInst = dyn_cast<Instruction>(NewVal); + if (LatchInst && L->contains(LatchInst)) + NewVal = LVMap[LatchInst]; - PHI->setIncomingValueForBlock(NewPreHeader, NewVal); + PHI->setIncomingValueForBlock(NewPreHeader, NewVal); + } } for (const auto &[Term, Info] : Weights) { @@ -1090,6 +1254,7 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, simplifyLoop(L, &DT, LI, SE, AC, nullptr, PreserveLCSSA); NumPeeled++; + NumPeeledEnd += PeelLast; return true; } |