diff options
author | Jay Foad <jay.foad@amd.com> | 2023-07-28 11:12:59 +0100 |
---|---|---|
committer | Jay Foad <jay.foad@amd.com> | 2023-08-07 15:41:40 +0100 |
commit | 56d92c17583e5f0b5e1e521b5f614be79436fccc (patch) | |
tree | a01b5cd27b41175516118c52e9708defd9fe9dda /llvm/lib | |
parent | 97324f6274184e607fa6d6cffb1aebee317d4644 (diff) | |
download | llvm-56d92c17583e5f0b5e1e521b5f614be79436fccc.zip llvm-56d92c17583e5f0b5e1e521b5f614be79436fccc.tar.gz llvm-56d92c17583e5f0b5e1e521b5f614be79436fccc.tar.bz2 |
[MachineScheduler] Track physical register dependencies per-regunit
Change the scheduler's physical register dependency tracking from
registers-and-their-aliases to regunits. This has a couple of advantages
when subregisters are used:
- The dependency tracking is more accurate and creates fewer useless
edges in the dependency graph. An AMDGPU example, edited for clarity:
SU(0): $vgpr1 = V_MOV_B32 $sgpr0
SU(1): $vgpr1 = V_ADDC_U32 0, $vgpr1
SU(2): $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0
There is a data dependency on $vgpr1 from SU(0) to SU(1) and from
SU(1) to SU(2). But the old dependency tracking code also added a
useless edge from SU(0) to SU(2) because it thought that SU(0)'s def
of $vgpr1 aliased with SU(2)'s use of $vgpr0_vgpr1.
- On targets like AMDGPU that make heavy use of subregisters, each
register can have a huge number of aliases - it can be quadratic in
the size of the largest defined register tuple. There is a much lower
bound on the number of regunits per register, so iterating over
regunits is faster than iterating over aliases.
The LLVM compile-time tracker shows a tiny overall improvement of 0.03%
on X86. I expect a larger compile-time improvement on targets like
AMDGPU.
Recommit after fixing AggressiveAntiDepBreaker in D156880.
Differential Revision: https://reviews.llvm.org/D156552
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/CodeGen/ScheduleDAGInstrs.cpp | 65 |
1 files changed, 39 insertions, 26 deletions
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp index 37a1ef0..a42f842 100644 --- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -211,7 +211,8 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() { for (const MachineOperand &MO : ExitMI->all_uses()) { Register Reg = MO.getReg(); if (Reg.isPhysical()) { - Uses.insert(PhysRegSUOper(&ExitSU, -1, Reg)); + for (MCRegUnit Unit : TRI->regunits(Reg)) + Uses.insert(PhysRegSUOper(&ExitSU, -1, Unit)); } else if (Reg.isVirtual() && MO.readsReg()) { addVRegUseDeps(&ExitSU, MO.getOperandNo()); } @@ -222,8 +223,11 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() { // uses all the registers that are livein to the successor blocks. for (const MachineBasicBlock *Succ : BB->successors()) { for (const auto &LI : Succ->liveins()) { - if (!Uses.contains(LI.PhysReg)) - Uses.insert(PhysRegSUOper(&ExitSU, -1, LI.PhysReg)); + // TODO: Use LI.LaneMask to refine this. + for (MCRegUnit Unit : TRI->regunits(LI.PhysReg)) { + if (!Uses.contains(Unit)) + Uses.insert(PhysRegSUOper(&ExitSU, -1, Unit)); + } } } } @@ -244,8 +248,8 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) { const MCInstrDesc &DefMIDesc = SU->getInstr()->getDesc(); bool ImplicitPseudoDef = (OperIdx >= DefMIDesc.getNumOperands() && !DefMIDesc.hasImplicitDefOfPhysReg(Reg)); - for (MCRegAliasIterator Alias(Reg, TRI, true); Alias.isValid(); ++Alias) { - for (Reg2SUnitsMap::iterator I = Uses.find(*Alias); I != Uses.end(); ++I) { + for (MCRegUnit Unit : TRI->regunits(Reg)) { + for (Reg2SUnitsMap::iterator I = Uses.find(Unit); I != Uses.end(); ++I) { SUnit *UseSU = I->SU; if (UseSU == SU) continue; @@ -262,11 +266,14 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) { // Set the hasPhysRegDefs only for physreg defs that have a use within // the scheduling region. SU->hasPhysRegDefs = true; + UseInstr = UseSU->getInstr(); + Register UseReg = UseInstr->getOperand(UseOpIdx).getReg(); const MCInstrDesc &UseMIDesc = UseInstr->getDesc(); - ImplicitPseudoUse = (UseOpIdx >= ((int)UseMIDesc.getNumOperands()) && - !UseMIDesc.hasImplicitUseOfPhysReg(*Alias)); - Dep = SDep(SU, SDep::Data, *Alias); + ImplicitPseudoUse = UseOpIdx >= ((int)UseMIDesc.getNumOperands()) && + !UseMIDesc.hasImplicitUseOfPhysReg(UseReg); + + Dep = SDep(SU, SDep::Data, UseReg); } if (!ImplicitPseudoDef && !ImplicitPseudoUse) { Dep.setLatency(SchedModel.computeOperandLatency(SU->getInstr(), OperIdx, @@ -300,15 +307,16 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) { // TODO: Using a latency of 1 here for output dependencies assumes // there's no cost for reusing registers. SDep::Kind Kind = MO.isUse() ? SDep::Anti : SDep::Output; - for (MCRegAliasIterator Alias(Reg, TRI, true); Alias.isValid(); ++Alias) { - for (Reg2SUnitsMap::iterator I = Defs.find(*Alias); I != Defs.end(); ++I) { + for (MCRegUnit Unit : TRI->regunits(Reg)) { + for (Reg2SUnitsMap::iterator I = Defs.find(Unit); I != Defs.end(); ++I) { SUnit *DefSU = I->SU; if (DefSU == &ExitSU) continue; MachineInstr *DefInstr = DefSU->getInstr(); - if (DefSU != SU && (Kind != SDep::Output || !MO.isDead() || - !DefInstr->registerDefIsDead(*Alias))) { - SDep Dep(SU, Kind, /*Reg=*/*Alias); + MachineOperand &DefMO = DefInstr->getOperand(I->OpIdx); + if (DefSU != SU && + (Kind != SDep::Output || !MO.isDead() || !DefMO.isDead())) { + SDep Dep(SU, Kind, DefMO.getReg()); if (Kind != SDep::Anti) { Dep.setLatency( SchedModel.computeOutputLatency(MI, OperIdx, DefInstr)); @@ -324,37 +332,42 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) { // Either insert a new Reg2SUnits entry with an empty SUnits list, or // retrieve the existing SUnits list for this register's uses. // Push this SUnit on the use list. - Uses.insert(PhysRegSUOper(SU, OperIdx, Reg)); + for (MCRegUnit Unit : TRI->regunits(Reg)) + Uses.insert(PhysRegSUOper(SU, OperIdx, Unit)); if (RemoveKillFlags) MO.setIsKill(false); } else { addPhysRegDataDeps(SU, OperIdx); // Clear previous uses and defs of this register and its subregisters. - for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg)) { - Uses.eraseAll(SubReg); + for (MCRegUnit Unit : TRI->regunits(Reg)) { + Uses.eraseAll(Unit); if (!MO.isDead()) - Defs.eraseAll(SubReg); + Defs.eraseAll(Unit); } + if (MO.isDead() && SU->isCall) { // Calls will not be reordered because of chain dependencies (see // below). Since call operands are dead, calls may continue to be added // to the DefList making dependence checking quadratic in the size of // the block. Instead, we leave only one call at the back of the // DefList. - Reg2SUnitsMap::RangePair P = Defs.equal_range(Reg); - Reg2SUnitsMap::iterator B = P.first; - Reg2SUnitsMap::iterator I = P.second; - for (bool isBegin = I == B; !isBegin; /* empty */) { - isBegin = (--I) == B; - if (!I->SU->isCall) - break; - I = Defs.erase(I); + for (MCRegUnit Unit : TRI->regunits(Reg)) { + Reg2SUnitsMap::RangePair P = Defs.equal_range(Unit); + Reg2SUnitsMap::iterator B = P.first; + Reg2SUnitsMap::iterator I = P.second; + for (bool isBegin = I == B; !isBegin; /* empty */) { + isBegin = (--I) == B; + if (!I->SU->isCall) + break; + I = Defs.erase(I); + } } } // Defs are pushed in the order they are visited and never reordered. - Defs.insert(PhysRegSUOper(SU, OperIdx, Reg)); + for (MCRegUnit Unit : TRI->regunits(Reg)) + Defs.insert(PhysRegSUOper(SU, OperIdx, Unit)); } } |