author     Wei Mi <wmi@google.com>   2018-08-06 17:30:45 +0000
committer  Wei Mi <wmi@google.com>   2018-08-06 17:30:45 +0000
commit     3c1c088500c980e99401189d020d65c79ef10d4a
tree       0d923def3f61cd40174c55f50459e8ff3044ec38
parent     1508dd8b867f8113e96520229fa46c1ce53c9d98
[RegisterCoalescer] Delay live interval update work until rematerialization
for all the uses of the same def is done.

We ran into a compile-time problem with flex-generated code combined with
`-fno-jump-tables`. The cause is that MachineLICM hoists a lot of loop
invariants out of a big loop, which drastically increases compile time in
global register splitting and copy coalescing. https://reviews.llvm.org/D49353
relieves the problem in global splitting; this patch handles the problem in
copy coalescing.

The situation in which the copy coalescing problem shows up: after MachineLICM
we have several defs outside of a big loop, each with hundreds or thousands of
uses inside the loop. Rematerialization in copy coalescing happens for each
use, and every time a rematerialization is done, shrinkToUses is called to
update the huge live interval. Because a def has 'n' uses and each live
interval update costs at least 'n', the total update work is n^2.

To fix the problem, we do the live interval update work collectively. If a def
has more copy-like uses than a threshold, then each time rematerialization is
done for one of those uses we do not update the live interval immediately;
instead we delay that work until rematerialization for all those uses is
completed, so the live interval update only has to be done once. Delaying the
live interval update can potentially change the copy coalescing result, so we
limit the change to defs with many (above a hundred by default) copy-like
uses; the cutoff can be adjusted with -mllvm -late-remat-update-threshold=xxx.

Differential Revision: https://reviews.llvm.org/D49519

llvm-svn: 339035
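As a rough illustration of the batching idea (this is a toy sketch, not the
patch itself), the self-contained C++ model below rematerializes the uses of
one def one by one. The names ToyInterval, ToyCoalescer and rematerializeUse
are made up for the example, and the remaining-use count stands in for the
copy-like-use count the patch actually measures; only the threshold behavior
mirrors the patch: below the threshold the interval is shrunk immediately,
above it the register is parked in a ToBeUpdated set and shrunk once in
lateLiveIntervalUpdate.

#include <cstdio>
#include <unordered_map>
#include <unordered_set>
#include <vector>

// Toy model of the batching idea: with N copy-like uses of one def, shrinking
// the live interval after every rematerialization costs O(N^2) in total, while
// a single late shrink after all remats costs O(N).
struct ToyInterval {
  std::vector<int> Uses;  // remaining use points of the def
  long ShrinkCalls = 0;   // stands in for the NumShrinkToUses statistic
  void shrinkToUses() {   // pretend rescan of all remaining uses
    ++ShrinkCalls;
  }
};

struct ToyCoalescer {
  unsigned LateRematUpdateThreshold = 100;  // mirrors -late-remat-update-threshold
  std::unordered_set<int> ToBeUpdated;      // regs whose shrink is deferred
  std::unordered_map<int, ToyInterval> Intervals;

  void rematerializeUse(int Reg) {
    ToyInterval &LI = Intervals[Reg];
    if (!LI.Uses.empty())
      LI.Uses.pop_back();                   // this use has been rematerialized
    if (ToBeUpdated.count(Reg))
      return;                               // shrink already deferred for Reg
    if (LI.Uses.size() < LateRematUpdateThreshold)
      LI.shrinkToUses();                    // cheap case: update immediately
    else
      ToBeUpdated.insert(Reg);              // expensive case: defer the update
  }

  void lateLiveIntervalUpdate() {           // one collective update at the end
    for (int Reg : ToBeUpdated)
      Intervals[Reg].shrinkToUses();
    ToBeUpdated.clear();
  }
};

int main() {
  ToyCoalescer C;
  C.Intervals[1].Uses.assign(1000, 0);      // one def with 1000 copy-like uses
  for (int I = 0; I < 1000; ++I)
    C.rematerializeUse(1);
  C.lateLiveIntervalUpdate();
  std::printf("shrinkToUses calls: %ld\n", C.Intervals[1].ShrinkCalls);  // prints 1
  return 0;
}

With 1000 uses and the default threshold of 100, the toy shrinkToUses runs
once instead of 1000 times; the MIR test below checks the same effect in the
real pass with -late-remat-update-threshold=1 (3 rematerializations, 1
shrinkToUses call).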
-rw-r--r--  llvm/lib/CodeGen/RegisterCoalescer.cpp        63
-rw-r--r--  llvm/test/CodeGen/X86/late-remat-update.mir  118
2 files changed, 175 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index cad13a6..ccb5348 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -16,6 +16,7 @@
#include "RegisterCoalescer.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
@@ -69,6 +70,7 @@ STATISTIC(NumReMats , "Number of instructions re-materialized");
STATISTIC(NumInflated , "Number of register classes inflated");
STATISTIC(NumLaneConflicts, "Number of dead lane conflicts tested");
STATISTIC(NumLaneResolves, "Number of dead lane conflicts resolved");
+STATISTIC(NumShrinkToUses, "Number of shrinkToUses called");
static cl::opt<bool> EnableJoining("join-liveintervals",
cl::desc("Coalesce copies (default=true)"),
@@ -94,6 +96,15 @@ VerifyCoalescing("verify-coalescing",
cl::desc("Verify machine instrs before and after register coalescing"),
cl::Hidden);
+static cl::opt<unsigned> LateRematUpdateThreshold(
+ "late-remat-update-threshold", cl::Hidden,
+ cl::desc("During rematerialization for a copy, if the def instruction has "
+ "many other copy uses to be rematerialized, delay the multiple "
+ "separate live interval update work and do them all at once after "
+ "all those rematerialization are done. It will save a lot of "
+ "repeated work. "),
+ cl::init(100));
+
namespace {
class RegisterCoalescer : public MachineFunctionPass,
@@ -137,6 +148,11 @@ namespace {
/// Virtual registers to be considered for register class inflation.
SmallVector<unsigned, 8> InflateRegs;
+ /// The collection of live intervals which should have been updated
+ /// immediately after rematerialization but delayed until
+ /// lateLiveIntervalUpdate is called.
+ DenseSet<unsigned> ToBeUpdated;
+
/// Recursively eliminate dead defs in DeadDefs.
void eliminateDeadDefs();
@@ -157,6 +173,13 @@ namespace {
/// was made.
bool copyCoalesceWorkList(MutableArrayRef<MachineInstr*> CurrList);
+ /// If one def has many copy-like uses, and those copy uses are all
+ /// rematerialized, the live interval update needed for those
+ /// rematerializations will be delayed and done all at once instead
+ /// of being done multiple times. This is to save compile time because
+ /// live interval updates are costly.
+ void lateLiveIntervalUpdate();
+
/// Attempt to join intervals corresponding to SrcReg/DstReg, which are the
/// src/dst of the copy instruction CopyMI. This returns true if the copy
/// was successfully coalesced away. If it is not currently possible to
@@ -258,6 +281,7 @@ namespace {
/// mentioned method returns true.
void shrinkToUses(LiveInterval *LI,
SmallVectorImpl<MachineInstr * > *Dead = nullptr) {
+ NumShrinkToUses++;
if (LIS->shrinkToUses(LI, Dead)) {
/// Check whether or not \p LI is composed by multiple connected
/// components and if that is the case, fix that.
@@ -1365,11 +1389,9 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
LLVM_DEBUG(dbgs() << "Remat: " << NewMI);
++NumReMats;
- // The source interval can become smaller because we removed a use.
- shrinkToUses(&SrcInt, &DeadDefs);
- if (!DeadDefs.empty()) {
- // If the virtual SrcReg is completely eliminated, update all DBG_VALUEs
- // to describe DstReg instead.
+ // If the virtual SrcReg is completely eliminated, update all DBG_VALUEs
+ // to describe DstReg instead.
+ if (MRI->use_nodbg_empty(SrcReg)) {
for (MachineOperand &UseMO : MRI->use_operands(SrcReg)) {
MachineInstr *UseMI = UseMO.getParent();
if (UseMI->isDebugValue()) {
@@ -1380,9 +1402,24 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
LLVM_DEBUG(dbgs() << "\t\tupdated: " << *UseMI);
}
}
- eliminateDeadDefs();
}
+ if (ToBeUpdated.count(SrcReg))
+ return true;
+
+ long NumCopyUses = 0;
+ for (MachineOperand &UseMO : MRI->use_nodbg_operands(SrcReg)) {
+ if (UseMO.getParent()->isCopyLike())
+ NumCopyUses++;
+ }
+ if (NumCopyUses < LateRematUpdateThreshold) {
+ // The source interval can become smaller because we removed a use.
+ shrinkToUses(&SrcInt, &DeadDefs);
+ if (!DeadDefs.empty())
+ eliminateDeadDefs();
+ } else {
+ ToBeUpdated.insert(SrcReg);
+ }
return true;
}
@@ -3290,6 +3327,18 @@ static bool isLocalCopy(MachineInstr *Copy, const LiveIntervals *LIS) {
|| LIS->intervalIsInOneMBB(LIS->getInterval(DstReg));
}
+void RegisterCoalescer::lateLiveIntervalUpdate() {
+ for (unsigned reg : ToBeUpdated) {
+ if (!LIS->hasInterval(reg))
+ continue;
+ LiveInterval &LI = LIS->getInterval(reg);
+ shrinkToUses(&LI, &DeadDefs);
+ if (!DeadDefs.empty())
+ eliminateDeadDefs();
+ }
+ ToBeUpdated.clear();
+}
+
bool RegisterCoalescer::
copyCoalesceWorkList(MutableArrayRef<MachineInstr*> CurrList) {
bool Progress = false;
@@ -3459,12 +3508,14 @@ void RegisterCoalescer::joinAllIntervals() {
}
copyCoalesceInMBB(MBBs[i].MBB);
}
+ lateLiveIntervalUpdate();
coalesceLocals();
// Joining intervals can allow other intervals to be joined. Iteratively join
// until we make no progress.
while (copyCoalesceWorkList(WorkList))
/* empty */ ;
+ lateLiveIntervalUpdate();
}
void RegisterCoalescer::releaseMemory() {
diff --git a/llvm/test/CodeGen/X86/late-remat-update.mir b/llvm/test/CodeGen/X86/late-remat-update.mir
new file mode 100644
index 0000000..9ab02747a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/late-remat-update.mir
@@ -0,0 +1,118 @@
+# REQUIRES: asserts
+# RUN: llc -mtriple=x86_64-- -run-pass=simple-register-coalescing -late-remat-update-threshold=1 -stats %s -o /dev/null 2>&1 | FileCheck %s
+# Check that the test rematerializes three copies but calls shrinkToUses
+# only once to update the live range, because of the late rematerialization update.
+# CHECK: 3 regalloc - Number of instructions re-materialized
+# CHECK: 1 regalloc - Number of shrinkToUses called
+--- |
+ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+ target triple = "x86_64-unknown-linux-gnu"
+
+ ; Function Attrs: noreturn uwtable
+ define void @_Z3fooi(i32 %value) local_unnamed_addr #0 {
+ entry:
+ br label %do.body
+
+ do.body: ; preds = %do.body, %sw.bb2, %entry
+ tail call void asm sideeffect "", "~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{dirflag},~{fpsr},~{flags}"() #2, !srcloc !3
+ switch i32 %value, label %do.body [
+ i32 0, label %sw.bb
+ i32 1, label %sw.bb1
+ i32 2, label %sw.bb2
+ ]
+
+ sw.bb: ; preds = %do.body
+ tail call void @_Z3gooi(i32 2122)
+ br label %sw.bb1
+
+ sw.bb1: ; preds = %sw.bb, %do.body
+ tail call void @_Z3gooi(i32 2122)
+ br label %sw.bb2
+
+ sw.bb2: ; preds = %sw.bb1, %do.body
+ tail call void @_Z3gooi(i32 2122)
+ br label %do.body
+ }
+
+ declare void @_Z3gooi(i32) local_unnamed_addr #1
+
+ ; Function Attrs: nounwind
+ declare void @llvm.stackprotector(i8*, i8**) #2
+
+ attributes #0 = { noreturn uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #2 = { nounwind }
+
+ !llvm.module.flags = !{!0, !1}
+ !llvm.ident = !{!2}
+
+ !0 = !{i32 1, !"wchar_size", i32 4}
+ !1 = !{i32 7, !"PIC Level", i32 2}
+ !2 = !{!"clang version 7.0.0 (trunk 335057)"}
+ !3 = !{i32 82}
+
+...
+---
+name: _Z3fooi
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr32 }
+ - { id: 1, class: gr32 }
+ - { id: 2, class: gr32 }
+ - { id: 3, class: gr32 }
+ - { id: 4, class: gr32 }
+ - { id: 5, class: gr32 }
+liveins:
+ - { reg: '$edi', virtual-reg: '%0' }
+frameInfo:
+ hasCalls: true
+body: |
+ bb.0.entry:
+ liveins: $edi
+
+ %0:gr32 = COPY killed $edi
+ %5:gr32 = MOV32ri 2122
+
+ bb.1.do.body:
+ successors: %bb.6(0x15555555), %bb.2(0x6aaaaaab)
+
+ INLINEASM &"", 1, 12, implicit-def dead early-clobber $r10, 12, implicit-def dead early-clobber $r11, 12, implicit-def dead early-clobber $r12, 12, implicit-def dead early-clobber $r13, 12, implicit-def dead early-clobber $r14, 12, implicit-def dead early-clobber $r15, 12, implicit-def dead early-clobber $eflags, !3
+ CMP32ri8 %0, 2, implicit-def $eflags
+ JE_1 %bb.6, implicit killed $eflags
+ JMP_1 %bb.2
+
+ bb.2.do.body:
+ successors: %bb.5(0x19999999), %bb.3(0x66666667)
+
+ CMP32ri8 %0, 1, implicit-def $eflags
+ JE_1 %bb.5, implicit killed $eflags
+ JMP_1 %bb.3
+
+ bb.3.do.body:
+ successors: %bb.4(0x20000000), %bb.1(0x60000000)
+
+ TEST32rr %0, %0, implicit-def $eflags
+ JNE_1 %bb.1, implicit killed $eflags
+ JMP_1 %bb.4
+
+ bb.4.sw.bb:
+ ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ $edi = COPY %5
+ CALL64pcrel32 @_Z3gooi, csr_64, implicit $rsp, implicit $ssp, implicit killed $edi, implicit-def $rsp, implicit-def $ssp
+ ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+
+ bb.5.sw.bb1:
+ ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ $edi = COPY %5
+ CALL64pcrel32 @_Z3gooi, csr_64, implicit $rsp, implicit $ssp, implicit killed $edi, implicit-def $rsp, implicit-def $ssp
+ ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+
+ bb.6.sw.bb2:
+ ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ $edi = COPY %5
+ CALL64pcrel32 @_Z3gooi, csr_64, implicit $rsp, implicit $ssp, implicit killed $edi, implicit-def $rsp, implicit-def $ssp
+ ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ JMP_1 %bb.1
+
+...