aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Transforms/Utils/LoopUnroll.cpp
diff options
context:
space:
mode:
authorFlorian Hahn <flo@fhahn.com>2024-05-02 11:01:24 +0100
committerGitHub <noreply@github.com>2024-05-02 11:01:24 +0100
commit175d2971020ceaad3e1adcf9bb92e4ebaaa449ee (patch)
tree5acfd1ca6adbd881adaf88bb9a441c803507757e /llvm/lib/Transforms/Utils/LoopUnroll.cpp
parentb6328db80b4294cf83a3471ecdfab33003fa742d (diff)
downloadllvm-175d2971020ceaad3e1adcf9bb92e4ebaaa449ee.zip
llvm-175d2971020ceaad3e1adcf9bb92e4ebaaa449ee.tar.gz
llvm-175d2971020ceaad3e1adcf9bb92e4ebaaa449ee.tar.bz2
[LoopUnroll] Add CSE to remove redundant loads after unrolling. (#83860)
This patch adds loadCSE support to simplifyLoopAfterUnroll. It is based on EarlyCSE's implementation using ScopeHashTable and is using SCEV for accessed pointers to check to find redundant loads after unrolling. This applies to the late unroll pass only, for full unrolling those redundant loads will be cleaned up by the regular pipeline. The current approach constructs MSSA on-demand per-loop, but there is still small but notable compile-time impact: stage1-O3 +0.04% stage1-ReleaseThinLTO +0.06% stage1-ReleaseLTO-g +0.05% stage1-O0-g +0.02% stage2-O3 +0.09% stage2-O0-g +0.04% stage2-clang +0.02% https://llvm-compile-time-tracker.com/compare.php?from=c089fa5a729e217d0c0d4647656386dac1a1b135&to=ec7c0f27cb5c12b600d9adfc8543d131765ec7be&stat=instructions:u This benefits some workloads with runtime-unrolling disabled, where users use pragmas to force unrolling, as well as with runtime unrolling enabled. On SPEC/MultiSource, this removes a number of loads after unrolling on AArch64 with runtime unrolling enabled. ``` External/S...te/526.blender_r/526.blender_r 96 MultiSourc...rks/mediabench/gsm/toast/toast 39 SingleSource/Benchmarks/Misc/ffbench 4 External/SPEC/CINT2006/403.gcc/403.gcc 18 MultiSourc.../Applications/JM/ldecod/ldecod 4 MultiSourc.../mediabench/jpeg/jpeg-6a/cjpeg 6 MultiSourc...OE-ProxyApps-C/miniGMG/miniGMG 9 MultiSourc...e/Applications/ClamAV/clamscan 4 MultiSourc.../MallocBench/espresso/espresso 3 MultiSourc...dence-flt/LinearDependence-flt 2 MultiSourc...ch/office-ispell/office-ispell 4 MultiSourc...ch/consumer-jpeg/consumer-jpeg 6 MultiSourc...ench/security-sha/security-sha 11 MultiSourc...chmarks/McCat/04-bisect/bisect 3 SingleSour...tTests/2020-01-06-coverage-009 12 MultiSourc...ench/telecomm-gsm/telecomm-gsm 39 MultiSourc...lds-flt/CrossingThresholds-flt 24 MultiSourc...dence-dbl/LinearDependence-dbl 2 External/S...C/CINT2006/445.gobmk/445.gobmk 6 MultiSourc...enchmarks/mafft/pairlocalalign 53 External/S...31.deepsjeng_r/531.deepsjeng_r 3 External/S...rate/510.parest_r/510.parest_r 58 External/S...NT2006/464.h264ref/464.h264ref 29 External/S...NT2017rate/502.gcc_r/502.gcc_r 45 External/S...C/CINT2006/456.hmmer/456.hmmer 6 External/S...te/538.imagick_r/538.imagick_r 18 External/S.../CFP2006/447.dealII/447.dealII 4 MultiSourc...OE-ProxyApps-C++/miniFE/miniFE 12 External/S...2017rate/525.x264_r/525.x264_r 36 MultiSourc...Benchmarks/7zip/7zip-benchmark 33 MultiSourc...hmarks/ASC_Sequoia/AMGmk/AMGmk 2 MultiSourc...chmarks/VersaBench/8b10b/8b10b 1 MultiSourc.../Applications/JM/lencod/lencod 116 MultiSourc...lds-dbl/CrossingThresholds-dbl 24 MultiSource/Benchmarks/McCat/05-eks/eks 15 ``` PR: https://github.com/llvm/llvm-project/pull/83860
Diffstat (limited to 'llvm/lib/Transforms/Utils/LoopUnroll.cpp')
-rw-r--r--llvm/lib/Transforms/Utils/LoopUnroll.cpp155
1 files changed, 147 insertions, 8 deletions
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 6f0d000..e9969ae 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -18,17 +18,20 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopedHashTable.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/ilist_iterator.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/BasicBlock.h"
@@ -209,13 +212,140 @@ static bool isEpilogProfitable(Loop *L) {
return false;
}
+struct LoadValue {
+ Instruction *DefI = nullptr;
+ unsigned Generation = 0;
+ LoadValue() = default;
+ LoadValue(Instruction *Inst, unsigned Generation)
+ : DefI(Inst), Generation(Generation) {}
+};
+
+class StackNode {
+ ScopedHashTable<const SCEV *, LoadValue>::ScopeTy LoadScope;
+ unsigned CurrentGeneration;
+ unsigned ChildGeneration;
+ DomTreeNode *Node;
+ DomTreeNode::const_iterator ChildIter;
+ DomTreeNode::const_iterator EndIter;
+ bool Processed = false;
+
+public:
+ StackNode(ScopedHashTable<const SCEV *, LoadValue> &AvailableLoads,
+ unsigned cg, DomTreeNode *N, DomTreeNode::const_iterator Child,
+ DomTreeNode::const_iterator End)
+ : LoadScope(AvailableLoads), CurrentGeneration(cg), ChildGeneration(cg),
+ Node(N), ChildIter(Child), EndIter(End) {}
+ // Accessors.
+ unsigned currentGeneration() const { return CurrentGeneration; }
+ unsigned childGeneration() const { return ChildGeneration; }
+ void childGeneration(unsigned generation) { ChildGeneration = generation; }
+ DomTreeNode *node() { return Node; }
+ DomTreeNode::const_iterator childIter() const { return ChildIter; }
+
+ DomTreeNode *nextChild() {
+ DomTreeNode *Child = *ChildIter;
+ ++ChildIter;
+ return Child;
+ }
+
+ DomTreeNode::const_iterator end() const { return EndIter; }
+ bool isProcessed() const { return Processed; }
+ void process() { Processed = true; }
+};
+
+Value *getMatchingValue(LoadValue LV, LoadInst *LI, unsigned CurrentGeneration,
+ BatchAAResults &BAA,
+ function_ref<MemorySSA *()> GetMSSA) {
+ if (!LV.DefI)
+ return nullptr;
+ if (LV.DefI->getType() != LI->getType())
+ return nullptr;
+ if (LV.Generation != CurrentGeneration) {
+ MemorySSA *MSSA = GetMSSA();
+ if (!MSSA)
+ return nullptr;
+ auto *EarlierMA = MSSA->getMemoryAccess(LV.DefI);
+ MemoryAccess *LaterDef =
+ MSSA->getWalker()->getClobberingMemoryAccess(LI, BAA);
+ if (!MSSA->dominates(LaterDef, EarlierMA))
+ return nullptr;
+ }
+ return LV.DefI;
+}
+
+void loadCSE(Loop *L, DominatorTree &DT, ScalarEvolution &SE, LoopInfo &LI,
+ BatchAAResults &BAA, function_ref<MemorySSA *()> GetMSSA) {
+ ScopedHashTable<const SCEV *, LoadValue> AvailableLoads;
+ SmallVector<std::unique_ptr<StackNode>> NodesToProcess;
+ DomTreeNode *HeaderD = DT.getNode(L->getHeader());
+ NodesToProcess.emplace_back(new StackNode(AvailableLoads, 0, HeaderD,
+ HeaderD->begin(), HeaderD->end()));
+
+ unsigned CurrentGeneration = 0;
+ while (!NodesToProcess.empty()) {
+ StackNode *NodeToProcess = &*NodesToProcess.back();
+
+ CurrentGeneration = NodeToProcess->currentGeneration();
+
+ if (!NodeToProcess->isProcessed()) {
+ // Process the node.
+
+ // If this block has a single predecessor, then the predecessor is the
+ // parent
+ // of the domtree node and all of the live out memory values are still
+ // current in this block. If this block has multiple predecessors, then
+ // they could have invalidated the live-out memory values of our parent
+ // value. For now, just be conservative and invalidate memory if this
+ // block has multiple predecessors.
+ if (!NodeToProcess->node()->getBlock()->getSinglePredecessor())
+ ++CurrentGeneration;
+ for (auto &I : make_early_inc_range(*NodeToProcess->node()->getBlock())) {
+
+ auto *Load = dyn_cast<LoadInst>(&I);
+ if (!Load || !Load->isSimple()) {
+ if (I.mayWriteToMemory())
+ CurrentGeneration++;
+ continue;
+ }
+
+ const SCEV *PtrSCEV = SE.getSCEV(Load->getPointerOperand());
+ LoadValue LV = AvailableLoads.lookup(PtrSCEV);
+ if (Value *M =
+ getMatchingValue(LV, Load, CurrentGeneration, BAA, GetMSSA)) {
+ if (LI.replacementPreservesLCSSAForm(Load, M)) {
+ Load->replaceAllUsesWith(M);
+ Load->eraseFromParent();
+ }
+ } else {
+ AvailableLoads.insert(PtrSCEV, LoadValue(Load, CurrentGeneration));
+ }
+ }
+ NodeToProcess->childGeneration(CurrentGeneration);
+ NodeToProcess->process();
+ } else if (NodeToProcess->childIter() != NodeToProcess->end()) {
+ // Push the next child onto the stack.
+ DomTreeNode *Child = NodeToProcess->nextChild();
+ if (!L->contains(Child->getBlock()))
+ continue;
+ NodesToProcess.emplace_back(
+ new StackNode(AvailableLoads, NodeToProcess->childGeneration(), Child,
+ Child->begin(), Child->end()));
+ } else {
+ // It has been processed, and there are no more children to process,
+ // so delete it and pop it off the stack.
+ NodesToProcess.pop_back();
+ }
+ }
+}
+
/// Perform some cleanup and simplifications on loops after unrolling. It is
/// useful to simplify the IV's in the new loop, as well as do a quick
/// simplify/dce pass of the instructions.
void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
ScalarEvolution *SE, DominatorTree *DT,
AssumptionCache *AC,
- const TargetTransformInfo *TTI) {
+ const TargetTransformInfo *TTI,
+ AAResults *AA) {
using namespace llvm::PatternMatch;
// Simplify any new induction variables in the partially unrolled loop.
@@ -230,6 +360,16 @@ void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
if (Instruction *Inst = dyn_cast_or_null<Instruction>(V))
RecursivelyDeleteTriviallyDeadInstructions(Inst);
}
+
+ if (AA) {
+ std::unique_ptr<MemorySSA> MSSA = nullptr;
+ BatchAAResults BAA(*AA);
+ loadCSE(L, *DT, *SE, *LI, BAA, [L, AA, DT, &MSSA]() -> MemorySSA * {
+ if (!MSSA)
+ MSSA.reset(new MemorySSA(*L, AA, DT));
+ return &*MSSA;
+ });
+ }
}
// At this point, the code is well formed. Perform constprop, instsimplify,
@@ -292,12 +432,11 @@ void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
///
/// If RemainderLoop is non-null, it will receive the remainder loop (if
/// required and not fully unrolled).
-LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
- ScalarEvolution *SE, DominatorTree *DT,
- AssumptionCache *AC,
- const TargetTransformInfo *TTI,
- OptimizationRemarkEmitter *ORE,
- bool PreserveLCSSA, Loop **RemainderLoop) {
+LoopUnrollResult
+llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
+ ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
+ const TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE,
+ bool PreserveLCSSA, Loop **RemainderLoop, AAResults *AA) {
assert(DT && "DomTree is required");
if (!L->getLoopPreheader()) {
@@ -852,7 +991,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
// At this point, the code is well formed. We now simplify the unrolled loop,
// doing constant propagation and dead code elimination as we go.
simplifyLoopAfterUnroll(L, !CompletelyUnroll && ULO.Count > 1, LI, SE, DT, AC,
- TTI);
+ TTI, AA);
NumCompletelyUnrolled += CompletelyUnroll;
++NumUnrolled;