diff options
author | Florian Hahn <flo@fhahn.com> | 2024-05-02 11:01:24 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-05-02 11:01:24 +0100 |
commit | 175d2971020ceaad3e1adcf9bb92e4ebaaa449ee (patch) | |
tree | 5acfd1ca6adbd881adaf88bb9a441c803507757e /llvm/lib/Transforms/Utils/LoopUnroll.cpp | |
parent | b6328db80b4294cf83a3471ecdfab33003fa742d (diff) | |
download | llvm-175d2971020ceaad3e1adcf9bb92e4ebaaa449ee.zip llvm-175d2971020ceaad3e1adcf9bb92e4ebaaa449ee.tar.gz llvm-175d2971020ceaad3e1adcf9bb92e4ebaaa449ee.tar.bz2 |
[LoopUnroll] Add CSE to remove redundant loads after unrolling. (#83860)
This patch adds loadCSE support to simplifyLoopAfterUnroll. It is based
on EarlyCSE's implementation using ScopeHashTable and is using SCEV for
accessed pointers to check to find redundant loads after unrolling.
This applies to the late unroll pass only, for full unrolling those
redundant loads will be cleaned up by the regular pipeline.
The current approach constructs MSSA on-demand per-loop, but there is
still small but notable compile-time impact:
stage1-O3 +0.04%
stage1-ReleaseThinLTO +0.06%
stage1-ReleaseLTO-g +0.05%
stage1-O0-g +0.02%
stage2-O3 +0.09%
stage2-O0-g +0.04%
stage2-clang +0.02%
https://llvm-compile-time-tracker.com/compare.php?from=c089fa5a729e217d0c0d4647656386dac1a1b135&to=ec7c0f27cb5c12b600d9adfc8543d131765ec7be&stat=instructions:u
This benefits some workloads with runtime-unrolling disabled,
where users use pragmas to force unrolling, as well as with
runtime unrolling enabled.
On SPEC/MultiSource, this removes a number of loads after unrolling
on AArch64 with runtime unrolling enabled.
```
External/S...te/526.blender_r/526.blender_r 96
MultiSourc...rks/mediabench/gsm/toast/toast 39
SingleSource/Benchmarks/Misc/ffbench 4
External/SPEC/CINT2006/403.gcc/403.gcc 18
MultiSourc.../Applications/JM/ldecod/ldecod 4
MultiSourc.../mediabench/jpeg/jpeg-6a/cjpeg 6
MultiSourc...OE-ProxyApps-C/miniGMG/miniGMG 9
MultiSourc...e/Applications/ClamAV/clamscan 4
MultiSourc.../MallocBench/espresso/espresso 3
MultiSourc...dence-flt/LinearDependence-flt 2
MultiSourc...ch/office-ispell/office-ispell 4
MultiSourc...ch/consumer-jpeg/consumer-jpeg 6
MultiSourc...ench/security-sha/security-sha 11
MultiSourc...chmarks/McCat/04-bisect/bisect 3
SingleSour...tTests/2020-01-06-coverage-009 12
MultiSourc...ench/telecomm-gsm/telecomm-gsm 39
MultiSourc...lds-flt/CrossingThresholds-flt 24
MultiSourc...dence-dbl/LinearDependence-dbl 2
External/S...C/CINT2006/445.gobmk/445.gobmk 6
MultiSourc...enchmarks/mafft/pairlocalalign 53
External/S...31.deepsjeng_r/531.deepsjeng_r 3
External/S...rate/510.parest_r/510.parest_r 58
External/S...NT2006/464.h264ref/464.h264ref 29
External/S...NT2017rate/502.gcc_r/502.gcc_r 45
External/S...C/CINT2006/456.hmmer/456.hmmer 6
External/S...te/538.imagick_r/538.imagick_r 18
External/S.../CFP2006/447.dealII/447.dealII 4
MultiSourc...OE-ProxyApps-C++/miniFE/miniFE 12
External/S...2017rate/525.x264_r/525.x264_r 36
MultiSourc...Benchmarks/7zip/7zip-benchmark 33
MultiSourc...hmarks/ASC_Sequoia/AMGmk/AMGmk 2
MultiSourc...chmarks/VersaBench/8b10b/8b10b 1
MultiSourc.../Applications/JM/lencod/lencod 116
MultiSourc...lds-dbl/CrossingThresholds-dbl 24
MultiSource/Benchmarks/McCat/05-eks/eks 15
```
PR: https://github.com/llvm/llvm-project/pull/83860
Diffstat (limited to 'llvm/lib/Transforms/Utils/LoopUnroll.cpp')
-rw-r--r-- | llvm/lib/Transforms/Utils/LoopUnroll.cpp | 155 |
1 files changed, 147 insertions, 8 deletions
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 6f0d000..e9969ae 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -18,17 +18,20 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopedHashTable.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/ADT/ilist_iterator.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" +#include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/BasicBlock.h" @@ -209,13 +212,140 @@ static bool isEpilogProfitable(Loop *L) { return false; } +struct LoadValue { + Instruction *DefI = nullptr; + unsigned Generation = 0; + LoadValue() = default; + LoadValue(Instruction *Inst, unsigned Generation) + : DefI(Inst), Generation(Generation) {} +}; + +class StackNode { + ScopedHashTable<const SCEV *, LoadValue>::ScopeTy LoadScope; + unsigned CurrentGeneration; + unsigned ChildGeneration; + DomTreeNode *Node; + DomTreeNode::const_iterator ChildIter; + DomTreeNode::const_iterator EndIter; + bool Processed = false; + +public: + StackNode(ScopedHashTable<const SCEV *, LoadValue> &AvailableLoads, + unsigned cg, DomTreeNode *N, DomTreeNode::const_iterator Child, + DomTreeNode::const_iterator End) + : LoadScope(AvailableLoads), CurrentGeneration(cg), ChildGeneration(cg), + Node(N), ChildIter(Child), EndIter(End) {} + // Accessors. + unsigned currentGeneration() const { return CurrentGeneration; } + unsigned childGeneration() const { return ChildGeneration; } + void childGeneration(unsigned generation) { ChildGeneration = generation; } + DomTreeNode *node() { return Node; } + DomTreeNode::const_iterator childIter() const { return ChildIter; } + + DomTreeNode *nextChild() { + DomTreeNode *Child = *ChildIter; + ++ChildIter; + return Child; + } + + DomTreeNode::const_iterator end() const { return EndIter; } + bool isProcessed() const { return Processed; } + void process() { Processed = true; } +}; + +Value *getMatchingValue(LoadValue LV, LoadInst *LI, unsigned CurrentGeneration, + BatchAAResults &BAA, + function_ref<MemorySSA *()> GetMSSA) { + if (!LV.DefI) + return nullptr; + if (LV.DefI->getType() != LI->getType()) + return nullptr; + if (LV.Generation != CurrentGeneration) { + MemorySSA *MSSA = GetMSSA(); + if (!MSSA) + return nullptr; + auto *EarlierMA = MSSA->getMemoryAccess(LV.DefI); + MemoryAccess *LaterDef = + MSSA->getWalker()->getClobberingMemoryAccess(LI, BAA); + if (!MSSA->dominates(LaterDef, EarlierMA)) + return nullptr; + } + return LV.DefI; +} + +void loadCSE(Loop *L, DominatorTree &DT, ScalarEvolution &SE, LoopInfo &LI, + BatchAAResults &BAA, function_ref<MemorySSA *()> GetMSSA) { + ScopedHashTable<const SCEV *, LoadValue> AvailableLoads; + SmallVector<std::unique_ptr<StackNode>> NodesToProcess; + DomTreeNode *HeaderD = DT.getNode(L->getHeader()); + NodesToProcess.emplace_back(new StackNode(AvailableLoads, 0, HeaderD, + HeaderD->begin(), HeaderD->end())); + + unsigned CurrentGeneration = 0; + while (!NodesToProcess.empty()) { + StackNode *NodeToProcess = &*NodesToProcess.back(); + + CurrentGeneration = NodeToProcess->currentGeneration(); + + if (!NodeToProcess->isProcessed()) { + // Process the node. + + // If this block has a single predecessor, then the predecessor is the + // parent + // of the domtree node and all of the live out memory values are still + // current in this block. If this block has multiple predecessors, then + // they could have invalidated the live-out memory values of our parent + // value. For now, just be conservative and invalidate memory if this + // block has multiple predecessors. + if (!NodeToProcess->node()->getBlock()->getSinglePredecessor()) + ++CurrentGeneration; + for (auto &I : make_early_inc_range(*NodeToProcess->node()->getBlock())) { + + auto *Load = dyn_cast<LoadInst>(&I); + if (!Load || !Load->isSimple()) { + if (I.mayWriteToMemory()) + CurrentGeneration++; + continue; + } + + const SCEV *PtrSCEV = SE.getSCEV(Load->getPointerOperand()); + LoadValue LV = AvailableLoads.lookup(PtrSCEV); + if (Value *M = + getMatchingValue(LV, Load, CurrentGeneration, BAA, GetMSSA)) { + if (LI.replacementPreservesLCSSAForm(Load, M)) { + Load->replaceAllUsesWith(M); + Load->eraseFromParent(); + } + } else { + AvailableLoads.insert(PtrSCEV, LoadValue(Load, CurrentGeneration)); + } + } + NodeToProcess->childGeneration(CurrentGeneration); + NodeToProcess->process(); + } else if (NodeToProcess->childIter() != NodeToProcess->end()) { + // Push the next child onto the stack. + DomTreeNode *Child = NodeToProcess->nextChild(); + if (!L->contains(Child->getBlock())) + continue; + NodesToProcess.emplace_back( + new StackNode(AvailableLoads, NodeToProcess->childGeneration(), Child, + Child->begin(), Child->end())); + } else { + // It has been processed, and there are no more children to process, + // so delete it and pop it off the stack. + NodesToProcess.pop_back(); + } + } +} + /// Perform some cleanup and simplifications on loops after unrolling. It is /// useful to simplify the IV's in the new loop, as well as do a quick /// simplify/dce pass of the instructions. void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, - const TargetTransformInfo *TTI) { + const TargetTransformInfo *TTI, + AAResults *AA) { using namespace llvm::PatternMatch; // Simplify any new induction variables in the partially unrolled loop. @@ -230,6 +360,16 @@ void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI, if (Instruction *Inst = dyn_cast_or_null<Instruction>(V)) RecursivelyDeleteTriviallyDeadInstructions(Inst); } + + if (AA) { + std::unique_ptr<MemorySSA> MSSA = nullptr; + BatchAAResults BAA(*AA); + loadCSE(L, *DT, *SE, *LI, BAA, [L, AA, DT, &MSSA]() -> MemorySSA * { + if (!MSSA) + MSSA.reset(new MemorySSA(*L, AA, DT)); + return &*MSSA; + }); + } } // At this point, the code is well formed. Perform constprop, instsimplify, @@ -292,12 +432,11 @@ void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI, /// /// If RemainderLoop is non-null, it will receive the remainder loop (if /// required and not fully unrolled). -LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, - ScalarEvolution *SE, DominatorTree *DT, - AssumptionCache *AC, - const TargetTransformInfo *TTI, - OptimizationRemarkEmitter *ORE, - bool PreserveLCSSA, Loop **RemainderLoop) { +LoopUnrollResult +llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, + ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, + const TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE, + bool PreserveLCSSA, Loop **RemainderLoop, AAResults *AA) { assert(DT && "DomTree is required"); if (!L->getLoopPreheader()) { @@ -852,7 +991,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, // At this point, the code is well formed. We now simplify the unrolled loop, // doing constant propagation and dead code elimination as we go. simplifyLoopAfterUnroll(L, !CompletelyUnroll && ULO.Count > 1, LI, SE, DT, AC, - TTI); + TTI, AA); NumCompletelyUnrolled += CompletelyUnroll; ++NumUnrolled; |