author     Jay Foad <jay.foad@amd.com>  2023-07-28 11:12:59 +0100
committer  Jay Foad <jay.foad@amd.com>  2023-08-07 15:41:40 +0100
commit     56d92c17583e5f0b5e1e521b5f614be79436fccc
tree       a01b5cd27b41175516118c52e9708defd9fe9dda
parent     97324f6274184e607fa6d6cffb1aebee317d4644
[MachineScheduler] Track physical register dependencies per-regunit
Change the scheduler's physical register dependency tracking from
registers-and-their-aliases to regunits. This has a couple of advantages
when subregisters are used:
- The dependency tracking is more accurate and creates fewer useless
  edges in the dependency graph. An AMDGPU example, edited for clarity:

    SU(0): $vgpr1 = V_MOV_B32 $sgpr0
    SU(1): $vgpr1 = V_ADDC_U32 0, $vgpr1
    SU(2): $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0

  There is a data dependency on $vgpr1 from SU(0) to SU(1) and from
  SU(1) to SU(2). But the old dependency tracking code also added a
  useless edge from SU(0) to SU(2) because it thought that SU(0)'s def
  of $vgpr1 aliased with SU(2)'s use of $vgpr0_vgpr1.
- On targets like AMDGPU that make heavy use of subregisters, each
  register can have a huge number of aliases - it can be quadratic in
  the size of the largest defined register tuple. There is a much lower
  bound on the number of regunits per register, so iterating over
  regunits is faster than iterating over aliases (see the sketch below).
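
To make the cost argument concrete, here is a minimal sketch (not part of
the patch; countTrackingWork is a hypothetical helper, while
MCRegAliasIterator and MCRegisterInfo::regunits() are the real APIs the
patch switches between) of what each scheme iterates over for a single
physical register:

    // Sketch only: contrast the old alias walk with the new regunit walk.
    #include "llvm/MC/MCRegisterInfo.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    static void countTrackingWork(const MCRegisterInfo &MRI, MCRegister Reg) {
      // Old scheme: visit Reg plus every register that overlaps it. For a
      // wide AMDGPU register tuple this set can be very large.
      unsigned NumAliases = 0;
      for (MCRegAliasIterator AI(Reg, &MRI, /*IncludeSelf=*/true);
           AI.isValid(); ++AI)
        ++NumAliases;

      // New scheme: visit only the regunits that Reg covers. Two registers
      // overlap exactly when they share a regunit, so keying the scheduler's
      // Uses/Defs maps on regunits loses no overlap information.
      unsigned NumUnits = 0;
      for ([[maybe_unused]] MCRegUnit Unit : MRI.regunits(Reg))
        ++NumUnits;

      errs() << "aliases=" << NumAliases << ", regunits=" << NumUnits << "\n";
    }

The alias count grows with the number of overlapping register tuples the
target defines, while the regunit count is bounded by how many of the
smallest registers Reg covers, which is where the expected compile-time
saving comes from.
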
The LLVM compile-time tracker shows a tiny overall improvement of 0.03%
on X86. I expect a larger compile-time improvement on targets like
AMDGPU.
Recommit after fixing AggressiveAntiDepBreaker in D156880.
Differential Revision: https://reviews.llvm.org/D156552
-rw-r--r--  llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h                         |  7
-rw-r--r--  llvm/lib/CodeGen/ScheduleDAGInstrs.cpp                                | 65
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll                             |  4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/load-global-i16.ll                           |  4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/schedule-physregdeps.mir                     | 12
-rw-r--r--  llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs.ll  |  4
-rw-r--r--  llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting.ll                |  2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vldst4.ll                                |  2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vst3.ll                                  |  2
9 files changed, 52 insertions, 50 deletions
diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
index 5ea68e0..abffcd5 100644
--- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -77,11 +77,12 @@ namespace llvm {
   struct PhysRegSUOper {
     SUnit *SU;
     int OpIdx;
-    unsigned Reg;
+    unsigned RegUnit;
 
-    PhysRegSUOper(SUnit *su, int op, unsigned R): SU(su), OpIdx(op), Reg(R) {}
+    PhysRegSUOper(SUnit *su, int op, unsigned R)
+        : SU(su), OpIdx(op), RegUnit(R) {}
 
-    unsigned getSparseSetIndex() const { return Reg; }
+    unsigned getSparseSetIndex() const { return RegUnit; }
   };
 
   /// Use a SparseMultiSet to track physical registers. Storage is only
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index 37a1ef0..a42f842 100644
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -211,7 +211,8 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() {
     for (const MachineOperand &MO : ExitMI->all_uses()) {
       Register Reg = MO.getReg();
       if (Reg.isPhysical()) {
-        Uses.insert(PhysRegSUOper(&ExitSU, -1, Reg));
+        for (MCRegUnit Unit : TRI->regunits(Reg))
+          Uses.insert(PhysRegSUOper(&ExitSU, -1, Unit));
       } else if (Reg.isVirtual() && MO.readsReg()) {
         addVRegUseDeps(&ExitSU, MO.getOperandNo());
       }
@@ -222,8 +223,11 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() {
     // uses all the registers that are livein to the successor blocks.
     for (const MachineBasicBlock *Succ : BB->successors()) {
       for (const auto &LI : Succ->liveins()) {
-        if (!Uses.contains(LI.PhysReg))
-          Uses.insert(PhysRegSUOper(&ExitSU, -1, LI.PhysReg));
+        // TODO: Use LI.LaneMask to refine this.
+        for (MCRegUnit Unit : TRI->regunits(LI.PhysReg)) {
+          if (!Uses.contains(Unit))
+            Uses.insert(PhysRegSUOper(&ExitSU, -1, Unit));
+        }
       }
     }
   }
@@ -244,8 +248,8 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
   const MCInstrDesc &DefMIDesc = SU->getInstr()->getDesc();
   bool ImplicitPseudoDef = (OperIdx >= DefMIDesc.getNumOperands() &&
                             !DefMIDesc.hasImplicitDefOfPhysReg(Reg));
-  for (MCRegAliasIterator Alias(Reg, TRI, true); Alias.isValid(); ++Alias) {
-    for (Reg2SUnitsMap::iterator I = Uses.find(*Alias); I != Uses.end(); ++I) {
+  for (MCRegUnit Unit : TRI->regunits(Reg)) {
+    for (Reg2SUnitsMap::iterator I = Uses.find(Unit); I != Uses.end(); ++I) {
       SUnit *UseSU = I->SU;
       if (UseSU == SU)
         continue;
@@ -262,11 +266,14 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
         // Set the hasPhysRegDefs only for physreg defs that have a use within
         // the scheduling region.
         SU->hasPhysRegDefs = true;
+
         UseInstr = UseSU->getInstr();
+        Register UseReg = UseInstr->getOperand(UseOpIdx).getReg();
         const MCInstrDesc &UseMIDesc = UseInstr->getDesc();
-        ImplicitPseudoUse = (UseOpIdx >= ((int)UseMIDesc.getNumOperands()) &&
-                             !UseMIDesc.hasImplicitUseOfPhysReg(*Alias));
-        Dep = SDep(SU, SDep::Data, *Alias);
+        ImplicitPseudoUse = UseOpIdx >= ((int)UseMIDesc.getNumOperands()) &&
+                            !UseMIDesc.hasImplicitUseOfPhysReg(UseReg);
+
+        Dep = SDep(SU, SDep::Data, UseReg);
       }
       if (!ImplicitPseudoDef && !ImplicitPseudoUse) {
         Dep.setLatency(SchedModel.computeOperandLatency(SU->getInstr(), OperIdx,
@@ -300,15 +307,16 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
   // TODO: Using a latency of 1 here for output dependencies assumes
   //       there's no cost for reusing registers.
   SDep::Kind Kind = MO.isUse() ? SDep::Anti : SDep::Output;
-  for (MCRegAliasIterator Alias(Reg, TRI, true); Alias.isValid(); ++Alias) {
-    for (Reg2SUnitsMap::iterator I = Defs.find(*Alias); I != Defs.end(); ++I) {
+  for (MCRegUnit Unit : TRI->regunits(Reg)) {
+    for (Reg2SUnitsMap::iterator I = Defs.find(Unit); I != Defs.end(); ++I) {
       SUnit *DefSU = I->SU;
       if (DefSU == &ExitSU)
         continue;
       MachineInstr *DefInstr = DefSU->getInstr();
-      if (DefSU != SU && (Kind != SDep::Output || !MO.isDead() ||
-                          !DefInstr->registerDefIsDead(*Alias))) {
-        SDep Dep(SU, Kind, /*Reg=*/*Alias);
+      MachineOperand &DefMO = DefInstr->getOperand(I->OpIdx);
+      if (DefSU != SU &&
+          (Kind != SDep::Output || !MO.isDead() || !DefMO.isDead())) {
+        SDep Dep(SU, Kind, DefMO.getReg());
         if (Kind != SDep::Anti) {
           Dep.setLatency(
               SchedModel.computeOutputLatency(MI, OperIdx, DefInstr));
@@ -324,37 +332,42 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
     // Either insert a new Reg2SUnits entry with an empty SUnits list, or
     // retrieve the existing SUnits list for this register's uses.
    // Push this SUnit on the use list.
-    Uses.insert(PhysRegSUOper(SU, OperIdx, Reg));
+    for (MCRegUnit Unit : TRI->regunits(Reg))
+      Uses.insert(PhysRegSUOper(SU, OperIdx, Unit));
     if (RemoveKillFlags)
       MO.setIsKill(false);
   } else {
     addPhysRegDataDeps(SU, OperIdx);
 
     // Clear previous uses and defs of this register and its subregisters.
-    for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg)) {
-      Uses.eraseAll(SubReg);
+    for (MCRegUnit Unit : TRI->regunits(Reg)) {
+      Uses.eraseAll(Unit);
       if (!MO.isDead())
-        Defs.eraseAll(SubReg);
+        Defs.eraseAll(Unit);
     }
+
     if (MO.isDead() && SU->isCall) {
       // Calls will not be reordered because of chain dependencies (see
       // below). Since call operands are dead, calls may continue to be added
       // to the DefList making dependence checking quadratic in the size of
       // the block. Instead, we leave only one call at the back of the
       // DefList.
-      Reg2SUnitsMap::RangePair P = Defs.equal_range(Reg);
-      Reg2SUnitsMap::iterator B = P.first;
-      Reg2SUnitsMap::iterator I = P.second;
-      for (bool isBegin = I == B; !isBegin; /* empty */) {
-        isBegin = (--I) == B;
-        if (!I->SU->isCall)
-          break;
-        I = Defs.erase(I);
+      for (MCRegUnit Unit : TRI->regunits(Reg)) {
+        Reg2SUnitsMap::RangePair P = Defs.equal_range(Unit);
+        Reg2SUnitsMap::iterator B = P.first;
+        Reg2SUnitsMap::iterator I = P.second;
+        for (bool isBegin = I == B; !isBegin; /* empty */) {
+          isBegin = (--I) == B;
+          if (!I->SU->isCall)
+            break;
+          I = Defs.erase(I);
+        }
       }
     }
 
     // Defs are pushed in the order they are visited and never reordered.
-    Defs.insert(PhysRegSUOper(SU, OperIdx, Reg));
+    for (MCRegUnit Unit : TRI->regunits(Reg))
+      Defs.insert(PhysRegSUOper(SU, OperIdx, Unit));
   }
 }
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 165eeb0..667c561 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1363,11 +1363,11 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1)
 ; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: s_movk_i32 s0, 0x7fff
 ; VI-NEXT: flat_load_ushort v3, v[0:1]
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: s_movk_i32 s0, 0x7fff
 ; VI-NEXT: v_mov_b32_e32 v0, s4
 ; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_bfi_b32 v2, s0, v3, v2
 ; VI-NEXT: flat_store_short v[0:1], v2
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 8a7cdf3..f957e03 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -3788,13 +3788,13 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2
 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, 0xffff, v3
 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v2
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v1
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0
 ; GCN-NOHSA-VI-NEXT: buffer_store_dword v32, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NOHSA-VI-NEXT: buffer_store_dword v33, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT: buffer_store_dword v34, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT: buffer_store_dword v35, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v1
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0
 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v38, 0xffff, v1
 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, 0xffff, v0
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v29
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-physregdeps.mir b/llvm/test/CodeGen/AMDGPU/schedule-physregdeps.mir
index 395a4f8..a6ff60a 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-physregdeps.mir
+++ b/llvm/test/CodeGen/AMDGPU/schedule-physregdeps.mir
@@ -4,15 +4,11 @@
 # CHECK: SU(0): $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec
 # CHECK: Successors:
 # CHECK-NEXT: SU(2): Out Latency=1
-# CHECK-NEXT: SU(4): Out Latency=1
 # CHECK-NEXT: SU(2): Data Latency=1 Reg=$vgpr0
-# CHECK-NEXT: SU(4): Data Latency=1 Reg=$vgpr0_vgpr1
 # CHECK: SU(1): $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec
 # CHECK: Successors:
 # CHECK-NEXT: SU(3): Out Latency=1
-# CHECK-NEXT: SU(4): Out Latency=1
 # CHECK-NEXT: SU(3): Data Latency=1 Reg=$vgpr1
-# CHECK-NEXT: SU(4): Data Latency=1 Reg=$vgpr0_vgpr1
 # CHECK: SU(2): $vgpr0 = V_ADD_CO_U32_e32 $sgpr2, $vgpr0, implicit-def $vcc, implicit $exec
 # CHECK: Predecessors:
 # CHECK-NEXT: SU(0): Out Latency=1
@@ -22,7 +18,6 @@
 # CHECK-NEXT: SU(4): Data Latency=1 Reg=$vgpr0_vgpr1
 # CHECK-NEXT: SU(3): Out Latency=1
 # CHECK-NEXT: SU(3): Data Latency=1 Reg=$vcc
-# CHECK-NEXT: SU(4): Anti Latency=0
 # CHECK: SU(3): $vgpr1 = V_ADDC_U32_e32 0, $vgpr1, implicit-def dead $vcc, implicit $vcc, implicit $exec
 # CHECK: Predecessors:
 # CHECK-NEXT: SU(2): Out Latency=1
@@ -32,19 +27,12 @@
 # CHECK: Successors:
 # CHECK-NEXT: SU(4): Out Latency=1
 # CHECK-NEXT: SU(4): Data Latency=1 Reg=$vgpr0_vgpr1
-# CHECK-NEXT: SU(4): Anti Latency=0
 # CHECK: SU(4): $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
 # CHECK: Predecessors:
 # CHECK-NEXT: SU(3): Out Latency=1
 # CHECK-NEXT: SU(3): Data Latency=1 Reg=$vgpr0_vgpr1
-# CHECK-NEXT: SU(3): Anti Latency=0
 # CHECK-NEXT: SU(2): Out Latency=1
 # CHECK-NEXT: SU(2): Data Latency=1 Reg=$vgpr0_vgpr1
-# CHECK-NEXT: SU(2): Anti Latency=0
-# CHECK-NEXT: SU(1): Out Latency=1
-# CHECK-NEXT: SU(1): Data Latency=1 Reg=$vgpr0_vgpr1
-# CHECK-NEXT: SU(0): Out Latency=1
-# CHECK-NEXT: SU(0): Data Latency=1 Reg=$vgpr0_vgpr1
 # CHECK: Successors:
 # CHECK-NEXT: ExitSU: Ord Latency=3 Artificial
diff --git a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs.ll b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs.ll
index f927bdd..3cbf3d2 100644
--- a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs.ll
+++ b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs.ll
@@ -35,8 +35,8 @@ define void @__int128_and_f(ptr noalias nocapture writeonly sret(i128) align 8 %
 ; Z15-LABEL: __int128_and_f:
 ; Z15: # %bb.0: # %entry
 ; Z15-NEXT: vl %v0, 0(%r3), 3
-; Z15-NEXT: vrepg %v6, %v0, 1
 ; Z15-NEXT: vlr %v4, %v0
+; Z15-NEXT: vrepg %v6, %v0, 1
 ; Z15-NEXT: #APP
 ; Z15-NEXT: #NO_APP
 ; Z15-NEXT: vmrhg %v0, %v4, %v6
@@ -260,8 +260,8 @@ entry:
 define <4 x i32> @vec128_and_f(<4 x i32> %cc_dep1) {
 ; CHECK-LABEL: vec128_and_f:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vrepg %v3, %v24, 1
 ; CHECK-NEXT: vlr %v1, %v24
+; CHECK-NEXT: vrepg %v3, %v24, 1
 ; CHECK-NEXT: #APP
 ; CHECK-NEXT: #NO_APP
 ; CHECK-NEXT: vmrhg %v24, %v1, %v3
diff --git a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting.ll b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting.ll
index b953fc3..23d78a9 100644
--- a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting.ll
+++ b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting.ll
@@ -249,8 +249,8 @@ entry:
 define <4 x i32> @vec128_and_f(<4 x i32> %cc_dep1) {
 ; CHECK-LABEL: vec128_and_f:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vrepg %v2, %v24, 1
 ; CHECK-NEXT: vlr %v0, %v24
+; CHECK-NEXT: vrepg %v2, %v24, 1
 ; CHECK-NEXT: #APP
 ; CHECK-NEXT: #NO_APP
 ; CHECK-NEXT: vmrhg %v24, %v0, %v2
diff --git a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
index 2e75ce9..219541c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
@@ -47,8 +47,8 @@ define void @vldst4(ptr nocapture readonly %pIn, ptr nocapture %pOut, i32 %numRo
 ; CHECK-NEXT: vmovx.f16 s8, s13
 ; CHECK-NEXT: vins.f16 s20, s22
 ; CHECK-NEXT: vins.f16 s16, s18
-; CHECK-NEXT: vins.f16 s25, s6
 ; CHECK-NEXT: vmov.f32 s2, s5
+; CHECK-NEXT: vins.f16 s25, s6
 ; CHECK-NEXT: vmov.f32 s3, s17
 ; CHECK-NEXT: vins.f16 s0, s15
 ; CHECK-NEXT: vmovx.f16 s9, s21
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
index 3f92152e1..7d662d1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -1358,8 +1358,8 @@ define void @vst3_v16f16(ptr %src, ptr %dst) {
 ; CHECK-NEXT: vldrw.u32 q3, [r0]
 ; CHECK-NEXT: vmov r2, s2
 ; CHECK-NEXT: vmov.f32 s16, s12
-; CHECK-NEXT: vins.f16 s16, s24
 ; CHECK-NEXT: vmov.f32 s0, s13
+; CHECK-NEXT: vins.f16 s16, s24
 ; CHECK-NEXT: vmov.16 q4[4], r2
 ; CHECK-NEXT: vins.f16 s0, s25
 ; CHECK-NEXT: vmov.f32 s19, s0