diff options
Diffstat (limited to 'llvm/lib/Transforms/Utils')
| -rw-r--r-- | llvm/lib/Transforms/Utils/BasicBlockUtils.cpp | 73 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Utils/ControlFlowUtils.cpp | 5 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Utils/FixIrreducible.cpp | 126 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Utils/LoopUnroll.cpp | 59 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp | 118 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Utils/LoopUtils.cpp | 48 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 35 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Utils/UnifyLoopExits.cpp | 77 |
8 files changed, 441 insertions, 100 deletions
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index 9829d4d..11db0ec 100644 --- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -674,6 +674,79 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT, return SplitBlock(BB, BB->getTerminator(), DT, LI, MSSAU, BBName); } +/// Helper function to update the cycle or loop information after inserting a +/// new block between a callbr instruction and one of its target blocks. Adds +/// the new block to the innermost cycle or loop that the callbr instruction and +/// the original target block share. +/// \p LCI cycle or loop information to update +/// \p CallBrBlock block containing the callbr instruction +/// \p CallBrTarget new target block of the callbr instruction +/// \p Succ original target block of the callbr instruction +template <typename TI, typename T> +static bool updateCycleLoopInfo(TI *LCI, BasicBlock *CallBrBlock, + BasicBlock *CallBrTarget, BasicBlock *Succ) { + static_assert(std::is_same_v<TI, CycleInfo> || std::is_same_v<TI, LoopInfo>, + "type must be CycleInfo or LoopInfo"); + if (!LCI) + return false; + + T *LC; + if constexpr (std::is_same_v<TI, CycleInfo>) + LC = LCI->getSmallestCommonCycle(CallBrBlock, Succ); + else + LC = LCI->getSmallestCommonLoop(CallBrBlock, Succ); + if (!LC) + return false; + + if constexpr (std::is_same_v<TI, CycleInfo>) + LCI->addBlockToCycle(CallBrTarget, LC); + else + LC->addBasicBlockToLoop(CallBrTarget, *LCI); + + return true; +} + +BasicBlock *llvm::SplitCallBrEdge(BasicBlock *CallBrBlock, BasicBlock *Succ, + unsigned SuccIdx, DomTreeUpdater *DTU, + CycleInfo *CI, LoopInfo *LI, + bool *UpdatedLI) { + CallBrInst *CallBr = dyn_cast<CallBrInst>(CallBrBlock->getTerminator()); + assert(CallBr && "expected callbr terminator"); + assert(SuccIdx < CallBr->getNumSuccessors() && + Succ == CallBr->getSuccessor(SuccIdx) && "invalid successor index"); + + // Create a new block between callbr and the specified successor. + // splitBlockBefore cannot be re-used here since it cannot split if the split + // point is a PHI node (because BasicBlock::splitBasicBlockBefore cannot + // handle that). But we don't need to rewire every part of a potential PHI + // node. We only care about the edge between CallBrBlock and the original + // successor. + BasicBlock *CallBrTarget = + BasicBlock::Create(CallBrBlock->getContext(), + CallBrBlock->getName() + ".target." + Succ->getName(), + CallBrBlock->getParent()); + // Rewire control flow from the new target block to the original successor. + Succ->replacePhiUsesWith(CallBrBlock, CallBrTarget); + // Rewire control flow from callbr to the new target block. + CallBr->setSuccessor(SuccIdx, CallBrTarget); + // Jump from the new target block to the original successor. + BranchInst::Create(Succ, CallBrTarget); + + bool Updated = + updateCycleLoopInfo<LoopInfo, Loop>(LI, CallBrBlock, CallBrTarget, Succ); + if (UpdatedLI) + *UpdatedLI = Updated; + updateCycleLoopInfo<CycleInfo, Cycle>(CI, CallBrBlock, CallBrTarget, Succ); + if (DTU) { + DTU->applyUpdates({{DominatorTree::Insert, CallBrBlock, CallBrTarget}}); + if (DTU->getDomTree().dominates(CallBrBlock, Succ)) + DTU->applyUpdates({{DominatorTree::Delete, CallBrBlock, Succ}, + {DominatorTree::Insert, CallBrTarget, Succ}}); + } + + return CallBrTarget; +} + void llvm::setUnwindEdgeTo(Instruction *TI, BasicBlock *Succ) { if (auto *II = dyn_cast<InvokeInst>(TI)) II->setUnwindDest(Succ); diff --git a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp index 0046a00..287a177 100644 --- a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp +++ b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp @@ -13,6 +13,7 @@ #include "llvm/Transforms/Utils/ControlFlowUtils.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/ValueHandle.h" @@ -281,7 +282,9 @@ std::pair<BasicBlock *, bool> ControlFlowHub::finalize( for (auto [BB, Succ0, Succ1] : Branches) { #ifndef NDEBUG - assert(Incoming.insert(BB).second && "Duplicate entry for incoming block."); + assert( + (Incoming.insert(BB).second || isa<CallBrInst>(BB->getTerminator())) && + "Duplicate entry for incoming block."); #endif if (Succ0) Outgoing.insert(Succ0); diff --git a/llvm/lib/Transforms/Utils/FixIrreducible.cpp b/llvm/lib/Transforms/Utils/FixIrreducible.cpp index 45e1d12..804af22 100644 --- a/llvm/lib/Transforms/Utils/FixIrreducible.cpp +++ b/llvm/lib/Transforms/Utils/FixIrreducible.cpp @@ -79,6 +79,53 @@ // Limitation: The pass cannot handle switch statements and indirect // branches. Both must be lowered to plain branches first. // +// CallBr support: CallBr is handled as a more general branch instruction which +// can have multiple successors. The pass redirects the edges to intermediate +// target blocks that unconditionally branch to the original callbr target +// blocks. This allows the control flow hub to know to which of the original +// target blocks to jump to. +// Example input CFG: +// Entry (callbr) +// / \ +// v v +// H ----> B +// ^ /| +// `----' | +// v +// Exit +// +// becomes: +// Entry (callbr) +// / \ +// v v +// target.H target.B +// | | +// v v +// H ----> B +// ^ /| +// `----' | +// v +// Exit +// +// Note +// OUTPUT CFG: Converted to a natural loop with a new header N. +// +// Entry (callbr) +// / \ +// v v +// target.H target.B +// \ / +// \ / +// v v +// N <---. +// / \ \ +// / \ | +// v v / +// H --> B --' +// | +// v +// Exit +// //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/FixIrreducible.h" @@ -231,6 +278,7 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT, return false; LLVM_DEBUG(dbgs() << "Processing cycle:\n" << CI.print(&C) << "\n";); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); ControlFlowHub CHub; SetVector<BasicBlock *> Predecessors; @@ -242,18 +290,32 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT, } for (BasicBlock *P : Predecessors) { - auto *Branch = cast<BranchInst>(P->getTerminator()); - // Exactly one of the two successors is the header. - BasicBlock *Succ0 = Branch->getSuccessor(0) == Header ? Header : nullptr; - BasicBlock *Succ1 = Succ0 ? nullptr : Header; - if (!Succ0) - assert(Branch->getSuccessor(1) == Header); - assert(Succ0 || Succ1); - CHub.addBranch(P, Succ0, Succ1); - - LLVM_DEBUG(dbgs() << "Added internal branch: " << P->getName() << " -> " - << (Succ0 ? Succ0->getName() : "") << " " - << (Succ1 ? Succ1->getName() : "") << "\n"); + if (BranchInst *Branch = dyn_cast<BranchInst>(P->getTerminator())) { + // Exactly one of the two successors is the header. + BasicBlock *Succ0 = Branch->getSuccessor(0) == Header ? Header : nullptr; + BasicBlock *Succ1 = Succ0 ? nullptr : Header; + assert(Succ0 || Branch->getSuccessor(1) == Header); + assert(Succ0 || Succ1); + CHub.addBranch(P, Succ0, Succ1); + + LLVM_DEBUG(dbgs() << "Added internal branch: " << printBasicBlock(P) + << " -> " << printBasicBlock(Succ0) + << (Succ0 && Succ1 ? " " : "") << printBasicBlock(Succ1) + << '\n'); + } else if (CallBrInst *CallBr = dyn_cast<CallBrInst>(P->getTerminator())) { + for (unsigned I = 0; I < CallBr->getNumSuccessors(); ++I) { + BasicBlock *Succ = CallBr->getSuccessor(I); + if (Succ != Header) + continue; + BasicBlock *NewSucc = SplitCallBrEdge(P, Succ, I, &DTU, &CI, LI); + CHub.addBranch(NewSucc, Succ); + LLVM_DEBUG(dbgs() << "Added internal branch: " + << printBasicBlock(NewSucc) << " -> " + << printBasicBlock(Succ) << '\n'); + } + } else { + llvm_unreachable("unsupported block terminator"); + } } // Redirect external incoming edges. This includes the edges on the header. @@ -266,17 +328,32 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT, } for (BasicBlock *P : Predecessors) { - auto *Branch = cast<BranchInst>(P->getTerminator()); - BasicBlock *Succ0 = Branch->getSuccessor(0); - Succ0 = C.contains(Succ0) ? Succ0 : nullptr; - BasicBlock *Succ1 = - Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1); - Succ1 = Succ1 && C.contains(Succ1) ? Succ1 : nullptr; - CHub.addBranch(P, Succ0, Succ1); - - LLVM_DEBUG(dbgs() << "Added external branch: " << P->getName() << " -> " - << (Succ0 ? Succ0->getName() : "") << " " - << (Succ1 ? Succ1->getName() : "") << "\n"); + if (BranchInst *Branch = dyn_cast<BranchInst>(P->getTerminator()); Branch) { + BasicBlock *Succ0 = Branch->getSuccessor(0); + Succ0 = C.contains(Succ0) ? Succ0 : nullptr; + BasicBlock *Succ1 = + Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1); + Succ1 = Succ1 && C.contains(Succ1) ? Succ1 : nullptr; + CHub.addBranch(P, Succ0, Succ1); + + LLVM_DEBUG(dbgs() << "Added external branch: " << printBasicBlock(P) + << " -> " << printBasicBlock(Succ0) + << (Succ0 && Succ1 ? " " : "") << printBasicBlock(Succ1) + << '\n'); + } else if (CallBrInst *CallBr = dyn_cast<CallBrInst>(P->getTerminator())) { + for (unsigned I = 0; I < CallBr->getNumSuccessors(); ++I) { + BasicBlock *Succ = CallBr->getSuccessor(I); + if (!C.contains(Succ)) + continue; + BasicBlock *NewSucc = SplitCallBrEdge(P, Succ, I, &DTU, &CI, LI); + CHub.addBranch(NewSucc, Succ); + LLVM_DEBUG(dbgs() << "Added external branch: " + << printBasicBlock(NewSucc) << " -> " + << printBasicBlock(Succ) << '\n'); + } + } else { + llvm_unreachable("unsupported block terminator"); + } } // Redirect all the backedges through a "hub" consisting of a series @@ -292,7 +369,6 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT, SetVector<BasicBlock *> Entries; Entries.insert(C.entry_rbegin(), C.entry_rend()); - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); CHub.finalize(&DTU, GuardBlocks, "irr"); #if defined(EXPENSIVE_CHECKS) assert(DT.verify(DominatorTree::VerificationLevel::Full)); @@ -325,8 +401,6 @@ static bool FixIrreducibleImpl(Function &F, CycleInfo &CI, DominatorTree &DT, LLVM_DEBUG(dbgs() << "===== Fix irreducible control-flow in function: " << F.getName() << "\n"); - assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator."); - bool Changed = false; for (Cycle *TopCycle : CI.toplevel_cycles()) { for (Cycle *C : depth_first(TopCycle)) { diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 4fe736a..94dfd3a 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -499,9 +499,9 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, const unsigned MaxTripCount = SE->getSmallConstantMaxTripCount(L); const bool MaxOrZero = SE->isBackedgeTakenCountMaxOrZero(L); - unsigned EstimatedLoopInvocationWeight = 0; std::optional<unsigned> OriginalTripCount = - llvm::getLoopEstimatedTripCount(L, &EstimatedLoopInvocationWeight); + llvm::getLoopEstimatedTripCount(L); + BranchProbability OriginalLoopProb = llvm::getLoopProbability(L); // Effectively "DCE" unrolled iterations that are beyond the max tripcount // and will never be executed. @@ -592,11 +592,11 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, : isEpilogProfitable(L); if (ULO.Runtime && - !UnrollRuntimeLoopRemainder(L, ULO.Count, ULO.AllowExpensiveTripCount, - EpilogProfitability, ULO.UnrollRemainder, - ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI, - PreserveLCSSA, ULO.SCEVExpansionBudget, - ULO.RuntimeUnrollMultiExit, RemainderLoop)) { + !UnrollRuntimeLoopRemainder( + L, ULO.Count, ULO.AllowExpensiveTripCount, EpilogProfitability, + ULO.UnrollRemainder, ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI, + PreserveLCSSA, ULO.SCEVExpansionBudget, ULO.RuntimeUnrollMultiExit, + RemainderLoop, OriginalTripCount, OriginalLoopProb)) { if (ULO.Force) ULO.Runtime = false; else { @@ -1130,11 +1130,46 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, LI->erase(L); // We shouldn't try to use `L` anymore. L = nullptr; - } else if (OriginalTripCount) { - // Update the trip count. Note that the remainder has already logic - // computing it in `UnrollRuntimeLoopRemainder`. - setLoopEstimatedTripCount(L, *OriginalTripCount / ULO.Count, - EstimatedLoopInvocationWeight); + } else { + // Update metadata for the loop's branch weights and estimated trip count: + // - If ULO.Runtime, UnrollRuntimeLoopRemainder sets the guard branch + // weights, latch branch weights, and estimated trip count of the + // remainder loop it creates. It also sets the branch weights for the + // unrolled loop guard it creates. The branch weights for the unrolled + // loop latch are adjusted below. FIXME: Handle prologue loops. + // - Otherwise, if unrolled loop iteration latches become unconditional, + // branch weights are adjusted above. FIXME: Actually handle such + // unconditional latches. + // - Otherwise, the original loop's branch weights are correct for the + // unrolled loop, so do not adjust them. + // - In all cases, the unrolled loop's estimated trip count is set below. + // + // As an example of the last case, consider what happens if the unroll count + // is 4 for a loop with an estimated trip count of 10 when we do not create + // a remainder loop and all iterations' latches remain conditional. Each + // unrolled iteration's latch still has the same probability of exiting the + // loop as it did when in the original loop, and thus it should still have + // the same branch weights. Each unrolled iteration's non-zero probability + // of exiting already appropriately reduces the probability of reaching the + // remaining iterations just as it did in the original loop. Trying to also + // adjust the branch weights of the final unrolled iteration's latch (i.e., + // the backedge for the unrolled loop as a whole) to reflect its new trip + // count of 3 will erroneously further reduce its block frequencies. + // However, in case an analysis later needs to estimate the trip count of + // the unrolled loop as a whole without considering the branch weights for + // each unrolled iteration's latch within it, we store the new trip count as + // separate metadata. + if (!OriginalLoopProb.isUnknown() && ULO.Runtime && EpilogProfitability) { + // Where p is always the probability of executing at least 1 more + // iteration, the probability for at least n more iterations is p^n. + setLoopProbability(L, OriginalLoopProb.pow(ULO.Count)); + } + if (OriginalTripCount) { + unsigned NewTripCount = *OriginalTripCount / ULO.Count; + if (!ULO.Runtime && *OriginalTripCount % ULO.Count) + ++NewTripCount; + setLoopEstimatedTripCount(L, NewTripCount); + } } // LoopInfo should not be valid, confirm that. diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 6312831..1e8f6cc 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -40,6 +40,7 @@ #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include <cmath> using namespace llvm; @@ -195,6 +196,21 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, } } +/// Assume, due to our position in the remainder loop or its guard, anywhere +/// from 0 to \p N more iterations can possibly execute. Among such cases in +/// the original loop (with loop probability \p OriginalLoopProb), what is the +/// probability of executing at least one more iteration? +static BranchProbability +probOfNextInRemainder(BranchProbability OriginalLoopProb, unsigned N) { + // Each of these variables holds the original loop's probability that the + // number of iterations it will execute is some m in the specified range. + BranchProbability ProbOne = OriginalLoopProb; // 1 <= m + BranchProbability ProbTooMany = ProbOne.pow(N + 1); // N + 1 <= m + BranchProbability ProbNotTooMany = ProbTooMany.getCompl(); // 0 <= m <= N + BranchProbability ProbOneNotTooMany = ProbOne - ProbTooMany; // 1 <= m <= N + return ProbOneNotTooMany / ProbNotTooMany; +} + /// Connect the unrolling epilog code to the original loop. /// The unrolling epilog code contains code to execute the /// 'extra' iterations if the run-time trip count modulo the @@ -221,7 +237,8 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader, ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA, ScalarEvolution &SE, - unsigned Count, AssumptionCache &AC) { + unsigned Count, AssumptionCache &AC, + BranchProbability OriginalLoopProb) { BasicBlock *Latch = L->getLoopLatch(); assert(Latch && "Loop must have a latch"); BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]); @@ -332,12 +349,19 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, PreserveLCSSA); // Add the branch to the exit block (around the epilog loop) MDNode *BranchWeights = nullptr; - if (hasBranchWeightMD(*Latch->getTerminator())) { + if (OriginalLoopProb.isUnknown() && + hasBranchWeightMD(*Latch->getTerminator())) { // Assume equal distribution in interval [0, Count). MDBuilder MDB(B.getContext()); BranchWeights = MDB.createBranchWeights(1, Count - 1); } - B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit, BranchWeights); + BranchInst *RemainderLoopGuard = + B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit, BranchWeights); + if (!OriginalLoopProb.isUnknown()) { + setBranchProbability(RemainderLoopGuard, + probOfNextInRemainder(OriginalLoopProb, Count - 1), + /*ForFirstTarget=*/true); + } InsertPt->eraseFromParent(); if (DT) { auto *NewDom = DT->findNearestCommonDominator(Exit, NewExit); @@ -357,14 +381,15 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, /// The cloned blocks should be inserted between InsertTop and InsertBot. /// InsertTop should be new preheader, InsertBot new loop exit. /// Returns the new cloned loop that is created. -static Loop * -CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, - const bool UnrollRemainder, - BasicBlock *InsertTop, - BasicBlock *InsertBot, BasicBlock *Preheader, +static Loop *CloneLoopBlocks(Loop *L, Value *NewIter, + const bool UseEpilogRemainder, + const bool UnrollRemainder, BasicBlock *InsertTop, + BasicBlock *InsertBot, BasicBlock *Preheader, std::vector<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap, - DominatorTree *DT, LoopInfo *LI, unsigned Count) { + DominatorTree *DT, LoopInfo *LI, unsigned Count, + std::optional<unsigned> OriginalTripCount, + BranchProbability OriginalLoopProb) { StringRef suffix = UseEpilogRemainder ? "epil" : "prol"; BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); @@ -419,7 +444,8 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, Builder.CreateAdd(NewIdx, One, NewIdx->getName() + ".next"); Value *IdxCmp = Builder.CreateICmpNE(IdxNext, NewIter, NewIdx->getName() + ".cmp"); MDNode *BranchWeights = nullptr; - if (hasBranchWeightMD(*LatchBR)) { + if ((OriginalLoopProb.isUnknown() || !UseEpilogRemainder) && + hasBranchWeightMD(*LatchBR)) { uint32_t ExitWeight; uint32_t BackEdgeWeight; if (Count >= 3) { @@ -437,7 +463,29 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, MDBuilder MDB(Builder.getContext()); BranchWeights = MDB.createBranchWeights(BackEdgeWeight, ExitWeight); } - Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot, BranchWeights); + BranchInst *RemainderLoopLatch = + Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot, BranchWeights); + if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) { + // Compute the total frequency of the original loop body from the + // remainder iterations. Once we've reached them, the first of them + // always executes, so its frequency and probability are 1. + double FreqRemIters = 1; + if (Count > 2) { + BranchProbability ProbReaching = BranchProbability::getOne(); + for (unsigned N = Count - 2; N >= 1; --N) { + ProbReaching *= probOfNextInRemainder(OriginalLoopProb, N); + FreqRemIters += double(ProbReaching.getNumerator()) / + ProbReaching.getDenominator(); + } + } + // Solve for the loop probability that would produce that frequency. + // Sum(i=0..inf)(Prob^i) = 1/(1-Prob) = FreqRemIters. + double ProbDouble = 1 - 1 / FreqRemIters; + BranchProbability Prob = BranchProbability::getBranchProbability( + std::round(ProbDouble * BranchProbability::getDenominator()), + BranchProbability::getDenominator()); + setBranchProbability(RemainderLoopLatch, Prob, /*ForFirstTarget=*/true); + } NewIdx->addIncoming(Zero, InsertTop); NewIdx->addIncoming(IdxNext, NewBB); LatchBR->eraseFromParent(); @@ -460,25 +508,13 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, Loop *NewLoop = NewLoops[L]; assert(NewLoop && "L should have been cloned"); - MDNode *LoopID = NewLoop->getLoopID(); - // Only add loop metadata if the loop is not going to be completely - // unrolled. - if (UnrollRemainder) - return NewLoop; - - std::optional<MDNode *> NewLoopID = makeFollowupLoopID( - LoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupRemainder}); - if (NewLoopID) { - NewLoop->setLoopID(*NewLoopID); - - // Do not setLoopAlreadyUnrolled if loop attributes have been defined - // explicitly. - return NewLoop; - } + if (OriginalTripCount && UseEpilogRemainder) + setLoopEstimatedTripCount(NewLoop, *OriginalTripCount % Count); // Add unroll disable metadata to disable future unrolling for this loop. - NewLoop->setLoopAlreadyUnrolled(); + if (!UnrollRemainder) + NewLoop->setLoopAlreadyUnrolled(); return NewLoop; } @@ -603,7 +639,8 @@ bool llvm::UnrollRuntimeLoopRemainder( LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, const TargetTransformInfo *TTI, bool PreserveLCSSA, unsigned SCEVExpansionBudget, bool RuntimeUnrollMultiExit, - Loop **ResultLoop) { + Loop **ResultLoop, std::optional<unsigned> OriginalTripCount, + BranchProbability OriginalLoopProb) { LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n"); LLVM_DEBUG(L->dump()); LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n" @@ -823,12 +860,23 @@ bool llvm::UnrollRuntimeLoopRemainder( BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit; // Branch to either remainder (extra iterations) loop or unrolling loop. MDNode *BranchWeights = nullptr; - if (hasBranchWeightMD(*Latch->getTerminator())) { + if ((OriginalLoopProb.isUnknown() || !UseEpilogRemainder) && + hasBranchWeightMD(*Latch->getTerminator())) { // Assume loop is nearly always entered. MDBuilder MDB(B.getContext()); BranchWeights = MDB.createBranchWeights(EpilogHeaderWeights); } - B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop, BranchWeights); + BranchInst *UnrollingLoopGuard = + B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop, BranchWeights); + if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) { + // The original loop's first iteration always happens. Compute the + // probability of the original loop executing Count-1 iterations after that + // to complete the first iteration of the unrolled loop. + BranchProbability ProbOne = OriginalLoopProb; + BranchProbability ProbRest = ProbOne.pow(Count - 1); + setBranchProbability(UnrollingLoopGuard, ProbRest, + /*ForFirstTarget=*/false); + } PreHeaderBR->eraseFromParent(); if (DT) { if (UseEpilogRemainder) @@ -855,9 +903,10 @@ bool llvm::UnrollRuntimeLoopRemainder( // iterations. This function adds the appropriate CFG connections. BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit; BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader; - Loop *remainderLoop = CloneLoopBlocks( - L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop, InsertBot, - NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI, Count); + Loop *remainderLoop = + CloneLoopBlocks(L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop, + InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, + LI, Count, OriginalTripCount, OriginalLoopProb); // Insert the cloned blocks into the function. F->splice(InsertBot->getIterator(), F, NewBlocks[0]->getIterator(), F->end()); @@ -956,7 +1005,8 @@ bool llvm::UnrollRuntimeLoopRemainder( // Connect the epilog code to the original loop and update the // PHI functions. ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader, EpilogPreHeader, - NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC); + NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC, + OriginalLoopProb); // Update counter in loop for unrolling. // Use an incrementing IV. Pre-incr/post-incr is backedge/trip count. diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index b6ba822..8be471b 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -962,13 +962,51 @@ bool llvm::setLoopEstimatedTripCount( if (LatchBranch->getSuccessor(0) != L->getHeader()) std::swap(BackedgeTakenWeight, LatchExitWeight); - MDBuilder MDB(LatchBranch->getContext()); - // Set/Update profile metadata. - LatchBranch->setMetadata( - LLVMContext::MD_prof, - MDB.createBranchWeights(BackedgeTakenWeight, LatchExitWeight)); + setBranchWeights(*LatchBranch, {BackedgeTakenWeight, LatchExitWeight}, + /*IsExpected=*/false); + + return true; +} + +BranchProbability llvm::getLoopProbability(Loop *L) { + BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L); + if (!LatchBranch) + return BranchProbability::getUnknown(); + bool FirstTargetIsLoop = LatchBranch->getSuccessor(0) == L->getHeader(); + return getBranchProbability(LatchBranch, FirstTargetIsLoop); +} +bool llvm::setLoopProbability(Loop *L, BranchProbability P) { + BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L); + if (!LatchBranch) + return false; + bool FirstTargetIsLoop = LatchBranch->getSuccessor(0) == L->getHeader(); + return setBranchProbability(LatchBranch, P, FirstTargetIsLoop); +} + +BranchProbability llvm::getBranchProbability(BranchInst *B, + bool ForFirstTarget) { + if (B->getNumSuccessors() != 2) + return BranchProbability::getUnknown(); + uint64_t Weight0, Weight1; + if (!extractBranchWeights(*B, Weight0, Weight1)) + return BranchProbability::getUnknown(); + if (!ForFirstTarget) + std::swap(Weight0, Weight1); + return BranchProbability::getBranchProbability(Weight0, Weight0 + Weight1); +} + +bool llvm::setBranchProbability(BranchInst *B, BranchProbability P, + bool ForFirstTarget) { + if (B->getNumSuccessors() != 2) + return false; + BranchProbability Prob0 = P; + BranchProbability Prob1 = P.getCompl(); + if (!ForFirstTarget) + std::swap(Prob0, Prob1); + setBranchWeights(*B, {Prob0.getNumerator(), Prob1.getNumerator()}, + /*IsExpected=*/false); return true; } diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index b03fb62..6addcfa 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -80,6 +80,7 @@ #include <algorithm> #include <cassert> #include <climits> +#include <cmath> #include <cstddef> #include <cstdint> #include <iterator> @@ -5977,14 +5978,14 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI, } // Prune obsolete incoming values off the successors' PHI nodes. - for (auto BBI = Dest->begin(); isa<PHINode>(BBI); ++BBI) { + for (auto &PHI : make_early_inc_range(Dest->phis())) { unsigned PreviousEdges = Cases->size(); if (Dest == SI->getDefaultDest()) ++PreviousEdges; for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I) - cast<PHINode>(BBI)->removeIncomingValue(SI->getParent()); + PHI.removeIncomingValue(SI->getParent()); } - for (auto BBI = OtherDest->begin(); isa<PHINode>(BBI); ++BBI) { + for (auto &PHI : make_early_inc_range(OtherDest->phis())) { unsigned PreviousEdges = OtherCases->size(); if (OtherDest == SI->getDefaultDest()) ++PreviousEdges; @@ -5993,7 +5994,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI, if (NewBI->isUnconditional()) ++E; for (unsigned I = 0; I != E; ++I) - cast<PHINode>(BBI)->removeIncomingValue(SI->getParent()); + PHI.removeIncomingValue(SI->getParent()); } // Clean up the default block - it may have phis or other instructions before @@ -7632,7 +7633,33 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder, auto *DefaultCaseBB = SI->getDefaultDest(); BasicBlock *SplitBB = SplitBlock(OrigBB, SI, DTU); auto It = OrigBB->getTerminator()->getIterator(); + SmallVector<uint32_t> Weights; + auto HasWeights = + !ProfcheckDisableMetadataFixes && extractBranchWeights(*SI, Weights); auto *BI = BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It); + if (HasWeights && any_of(Weights, [](const auto &V) { return V != 0; })) { + // IsPow2 covers a subset of the cases in which we'd go to the default + // label. The other is those powers of 2 that don't appear in the case + // statement. We don't know the distribution of the values coming in, so + // the safest is to split 50-50 the original probability to `default`. + uint64_t OrigDenominator = sum_of(map_range( + Weights, [](const auto &V) { return static_cast<uint64_t>(V); })); + SmallVector<uint64_t> NewWeights(2); + NewWeights[1] = Weights[0] / 2; + NewWeights[0] = OrigDenominator - NewWeights[1]; + setFittedBranchWeights(*BI, NewWeights, /*IsExpected=*/false); + + // For the original switch, we reduce the weight of the default by the + // amount by which the previous branch contributes to getting to default, + // and then make sure the remaining weights have the same relative ratio + // wrt eachother. + uint64_t CasesDenominator = OrigDenominator - Weights[0]; + Weights[0] /= 2; + for (auto &W : drop_begin(Weights)) + W = NewWeights[0] * static_cast<double>(W) / CasesDenominator; + + setBranchWeights(*SI, Weights, /*IsExpected=*/false); + } // BI is handling the default case for SI, and so should share its DebugLoc. BI->setDebugLoc(SI->getDebugLoc()); It->eraseFromParent(); diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp index 9f338db..94c5c170 100644 --- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp +++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp @@ -12,7 +12,11 @@ // // Limitation: This assumes that all terminators in the CFG are direct branches // (the "br" instruction). The presence of any other control flow -// such as indirectbr, switch or callbr will cause an assert. +// such as indirectbr or switch will cause an assert. +// The callbr terminator is supported by creating intermediate +// target blocks that unconditionally branch to the original target +// blocks. These intermediate target blocks can then be redirected +// through the ControlFlowHub as usual. // //===----------------------------------------------------------------------===// @@ -150,25 +154,55 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { SmallVector<BasicBlock *, 8> ExitingBlocks; L->getExitingBlocks(ExitingBlocks); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + SmallVector<BasicBlock *, 8> CallBrTargetBlocksToFix; // Redirect exiting edges through a control flow hub. ControlFlowHub CHub; - for (auto *BB : ExitingBlocks) { - auto *Branch = cast<BranchInst>(BB->getTerminator()); - BasicBlock *Succ0 = Branch->getSuccessor(0); - Succ0 = L->contains(Succ0) ? nullptr : Succ0; - - BasicBlock *Succ1 = - Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1); - Succ1 = L->contains(Succ1) ? nullptr : Succ1; - CHub.addBranch(BB, Succ0, Succ1); - - LLVM_DEBUG(dbgs() << "Added exiting branch: " << BB->getName() << " -> {" - << (Succ0 ? Succ0->getName() : "<none>") << ", " - << (Succ1 ? Succ1->getName() : "<none>") << "}\n"); + + for (unsigned I = 0; I < ExitingBlocks.size(); ++I) { + BasicBlock *BB = ExitingBlocks[I]; + if (BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator())) { + BasicBlock *Succ0 = Branch->getSuccessor(0); + Succ0 = L->contains(Succ0) ? nullptr : Succ0; + + BasicBlock *Succ1 = + Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1); + Succ1 = L->contains(Succ1) ? nullptr : Succ1; + CHub.addBranch(BB, Succ0, Succ1); + + LLVM_DEBUG(dbgs() << "Added extiting branch: " << printBasicBlock(BB) + << " -> " << printBasicBlock(Succ0) + << (Succ0 && Succ1 ? " " : "") << printBasicBlock(Succ1) + << '\n'); + } else if (CallBrInst *CallBr = dyn_cast<CallBrInst>(BB->getTerminator())) { + for (unsigned J = 0; J < CallBr->getNumSuccessors(); ++J) { + BasicBlock *Succ = CallBr->getSuccessor(J); + if (L->contains(Succ)) + continue; + bool UpdatedLI = false; + BasicBlock *NewSucc = + SplitCallBrEdge(BB, Succ, J, &DTU, nullptr, &LI, &UpdatedLI); + // Even if CallBr and Succ do not have a common parent loop, we need to + // add the new target block to the parent loop of the current loop. + if (!UpdatedLI) + CallBrTargetBlocksToFix.push_back(NewSucc); + // ExitingBlocks is later used to restore SSA, so we need to make sure + // that the blocks used for phi nodes in the guard blocks match the + // predecessors of the guard blocks, which, in the case of callbr, are + // the new intermediate target blocks instead of the callbr blocks + // themselves. + ExitingBlocks[I] = NewSucc; + CHub.addBranch(NewSucc, Succ); + LLVM_DEBUG(dbgs() << "Added exiting branch: " + << printBasicBlock(NewSucc) << " -> " + << printBasicBlock(Succ) << '\n'); + } + } else { + llvm_unreachable("unsupported block terminator"); + } } SmallVector<BasicBlock *, 8> GuardBlocks; - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); BasicBlock *LoopExitBlock; bool ChangedCFG; std::tie(LoopExitBlock, ChangedCFG) = CHub.finalize( @@ -187,10 +221,19 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { // The guard blocks were created outside the loop, so they need to become // members of the parent loop. - if (auto ParentLoop = L->getParentLoop()) { + // Same goes for the callbr target blocks. Although we try to add them to the + // smallest common parent loop of the callbr block and the corresponding + // original target block, there might not have been such a loop, in which case + // the newly created callbr target blocks are not part of any loop. For nested + // loops, this might result in them leading to a loop with multiple entry + // points. + if (auto *ParentLoop = L->getParentLoop()) { for (auto *G : GuardBlocks) { ParentLoop->addBasicBlockToLoop(G, LI); } + for (auto *C : CallBrTargetBlocksToFix) { + ParentLoop->addBasicBlockToLoop(C, LI); + } ParentLoop->verifyLoop(); } @@ -218,8 +261,6 @@ bool UnifyLoopExitsLegacyPass::runOnFunction(Function &F) { auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator."); - return runImpl(LI, DT); } |
