diff options
author | Valery Pykhtin <valery.pykhtin@amd.com> | 2025-10-13 16:17:11 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-10-13 16:17:11 +0200 |
commit | 8823efe77dad40eaea63b539c4d3d1036587ceb0 (patch) | |
tree | a4282f10d23b679dbe43558f0406db5399935d5b /llvm/lib/Target | |
parent | a80a6b31a721c8d73445c458f1ea2312d9262750 (diff) | |
download | llvm-8823efe77dad40eaea63b539c4d3d1036587ceb0.zip llvm-8823efe77dad40eaea63b539c4d3d1036587ceb0.tar.gz llvm-8823efe77dad40eaea63b539c4d3d1036587ceb0.tar.bz2 |
[AMDGPU] Add register usage debug printing at the point of maximum register pressure. (#161850)
This makes it possible to analyze "why are so many VGPRs used?".
It prints all live registers at the point of maximum register pressure
and for each register its defs/uses are dumped.
Currently it can be run before and after the scheduler, but it would be nice if
it could be run between any passes (not sure this is possible with the
legacy pass manager). Requires a debug build or a compiler built with assertions enabled.
Highly recommended to run with debug info to have debug locations for
instructions.
Example output:
```
*** Register pressure info (VGPRs) for _ZN7ck_tile6ken.... ***
Max pressure is 256 VGPRs at 41780e@BB.18 (LoopHdr BB.16, Depth 1): %9858:vreg_512_align2 = contract V_MFMA_F32_32X32X16_BF16_mac_vgprcd_e64 %10137:vreg_128_align2, %10141:vreg_128_align2, %9858:vreg_512_align2(tied-def 0), 0, 0, 0, implicit $mode, implicit $exec
Live registers with single definition (123 VGPRs):
%10126:VReg_128_Align2, LiveMask 00000000000000FF (4 VGPRs)
def 41600r@BB.18 (LoopHdr BB.16, Depth 1): undef %10126.sub0_sub1:vreg_128_align2 = DS_READ_B64_TR_B16 %478:vgpr_32, 15232, 0, implicit $exec :: (load (s64) from %ir.sunkaddr1314, !noalias !60, addrspace 3)
def 41608r@BB.18 (LoopHdr BB.16, Depth 1): %10126.sub2_sub3:vreg_128_align2 = DS_READ_B64_TR_B16 %478:vgpr_32, 16320, 0, implicit $exec :: (load (s64) from %ir.sunkaddr1315, !noalias !60, addrspace 3)
use 41848r@BB.18 (LoopHdr BB.16, Depth 1): %9856:vreg_512_align2 = contract V_MFMA_F32_32X32X16_BF16_mac_vgprcd_e64 %10126:vreg_128_align2, %10138:vreg_128_align2, %9856:vreg_512_align2(tied-def 0), 0, 0, 0, implicit $mode, implicit $exec
%10136:VReg_128_Align2, LiveMask 00000000000000FF (4 VGPRs)
def 41264r@BB.18 (LoopHdr BB.16, Depth 1): undef %10136.sub0_sub1:vreg_128_align2 = DS_READ_B64_TR_B16 %478:vgpr_32, 2176, 0, implicit $exec :: (load (s64) from %ir.sunkaddr1294, !noalias !60, addrspace 3)
def 41272r@BB.18 (LoopHdr BB.16, Depth 1): %10136.sub2_sub3:vreg_128_align2 = DS_READ_B64_TR_B16 %478:vgpr_32, 3264, 0, implicit $exec :: (load (s64) from %ir.sunkaddr1295, !noalias !60, addrspace 3)
use 41788r@BB.18 (LoopHdr BB.16, Depth 1): %9858:vreg_512_align2 = contract V_MFMA_F32_32X32X16_BF16_mac_vgprcd_e64 %10136:vreg_128_align2, %10140:vreg_128_align2, %9858:vreg_512_align2(tied-def 0), 0, 0, 0, implicit $mode, implicit $exec
%10129:VReg_128_Align2, LiveMask 00000000000000FF (4 VGPRs)
...
Live registers with multiple definitions (133 VGPRs):
%9856:VReg_512_Align2, LiveMask 00000000FFFFFFFF (16 VGPRs)
def 16544r@BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def undef %9856.sub0_sub1:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52
def 16592r@BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub2_sub3:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52
def 16608r@BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub4_sub5:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52
def 16656r@BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub6_sub7:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52
def 16672r@BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub8_sub9:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52
def 16720r@BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub10_sub11:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52
def 16736r@BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub12_sub13:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52
def 16784r@BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub14_sub15:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52
def use 41828r@BB.18 (LoopHdr BB.16, Depth 1): %9856:vreg_512_align2 = contract V_MFMA_F32_...
...
********** INTERVALS **********
...
********** MACHINEINSTRS **********
# Machine code for function _ZN7ck_tile6kentr...
```
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 132 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNRegPressure.h | 31 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 31 |
3 files changed, 188 insertions, 6 deletions
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 71494be..4e11c4f 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -14,6 +14,7 @@ #include "GCNRegPressure.h" #include "AMDGPU.h" #include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/RegisterPressure.h" using namespace llvm; @@ -459,10 +460,14 @@ LaneBitmask llvm::getLiveLaneMask(const LiveInterval &LI, SlotIndex SI, GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI, const LiveIntervals &LIS, - const MachineRegisterInfo &MRI) { + const MachineRegisterInfo &MRI, + GCNRegPressure::RegKind RegKind) { GCNRPTracker::LiveRegSet LiveRegs; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { auto Reg = Register::index2VirtReg(I); + if (RegKind != GCNRegPressure::TOTAL_KINDS && + GCNRegPressure::getRegKind(Reg, MRI) != RegKind) + continue; if (!LIS.hasInterval(Reg)) continue; auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI); @@ -986,3 +991,128 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) { #undef PFX } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void llvm::dumpMaxRegPressure(MachineFunction &MF, + GCNRegPressure::RegKind Kind, + LiveIntervals &LIS, + const MachineLoopInfo *MLI) { + + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + auto &OS = dbgs(); + const char *RegName = GCNRegPressure::getName(Kind); + + unsigned MaxNumRegs = 0; + const MachineInstr *MaxPressureMI = nullptr; + GCNUpwardRPTracker RPT(LIS); + for (const MachineBasicBlock &MBB : MF) { + RPT.reset(MRI, LIS.getSlotIndexes()->getMBBEndIdx(&MBB).getPrevSlot()); + for (const MachineInstr &MI : reverse(MBB)) { + RPT.recede(MI); + unsigned NumRegs = RPT.getMaxPressure().getNumRegs(Kind); + if (NumRegs > MaxNumRegs) { + MaxNumRegs = NumRegs; + MaxPressureMI = &MI; + } + } + 
} + + SlotIndex MISlot = LIS.getInstructionIndex(*MaxPressureMI); + + // Max pressure can occur at either the early-clobber or register slot. + // Choose the maximum liveset between both slots. This is ugly but this is + // diagnostic code. + SlotIndex ECSlot = MISlot.getRegSlot(true); + SlotIndex RSlot = MISlot.getRegSlot(false); + GCNRPTracker::LiveRegSet ECLiveSet = getLiveRegs(ECSlot, LIS, MRI, Kind); + GCNRPTracker::LiveRegSet RLiveSet = getLiveRegs(RSlot, LIS, MRI, Kind); + unsigned ECNumRegs = getRegPressure(MRI, ECLiveSet).getNumRegs(Kind); + unsigned RNumRegs = getRegPressure(MRI, RLiveSet).getNumRegs(Kind); + GCNRPTracker::LiveRegSet *LiveSet = + ECNumRegs > RNumRegs ? &ECLiveSet : &RLiveSet; + SlotIndex MaxPressureSlot = ECNumRegs > RNumRegs ? ECSlot : RSlot; + assert(getRegPressure(MRI, *LiveSet).getNumRegs(Kind) == MaxNumRegs); + + // Split live registers into single-def and multi-def sets. + GCNRegPressure SDefPressure, MDefPressure; + SmallVector<Register, 16> SDefRegs, MDefRegs; + for (auto [Reg, LaneMask] : *LiveSet) { + assert(GCNRegPressure::getRegKind(Reg, MRI) == Kind); + LiveInterval &LI = LIS.getInterval(Reg); + if (LI.getNumValNums() == 1 || + (LI.hasSubRanges() && + llvm::all_of(LI.subranges(), [](const LiveInterval::SubRange &SR) { + return SR.getNumValNums() == 1; + }))) { + SDefPressure.inc(Reg, LaneBitmask::getNone(), LaneMask, MRI); + SDefRegs.push_back(Reg); + } else { + MDefPressure.inc(Reg, LaneBitmask::getNone(), LaneMask, MRI); + MDefRegs.push_back(Reg); + } + } + unsigned SDefNumRegs = SDefPressure.getNumRegs(Kind); + unsigned MDefNumRegs = MDefPressure.getNumRegs(Kind); + assert(SDefNumRegs + MDefNumRegs == MaxNumRegs); + + auto printLoc = [&](const MachineBasicBlock *MBB, SlotIndex SI) { + return Printable([&, MBB, SI](raw_ostream &OS) { + OS << SI << ':' << printMBBReference(*MBB); + if (MLI) + if (const MachineLoop *ML = MLI->getLoopFor(MBB)) + OS << " (LoopHdr " << printMBBReference(*ML->getHeader()) + << ", Depth " << 
ML->getLoopDepth() << ")"; + }); + }; + + auto PrintRegInfo = [&](Register Reg, LaneBitmask LiveMask) { + GCNRegPressure RegPressure; + RegPressure.inc(Reg, LaneBitmask::getNone(), LiveMask, MRI); + OS << " " << printReg(Reg, TRI) << ':' + << TRI->getRegClassName(MRI.getRegClass(Reg)) << ", LiveMask " + << PrintLaneMask(LiveMask) << " (" << RegPressure.getNumRegs(Kind) << ' ' + << RegName << "s)\n"; + + // Use std::map to sort def/uses by SlotIndex. + std::map<SlotIndex, const MachineInstr *> Instrs; + for (const MachineInstr &MI : MRI.reg_nodbg_instructions(Reg)) { + Instrs[LIS.getInstructionIndex(MI).getRegSlot()] = &MI; + } + + for (const auto &[SI, MI] : Instrs) { + OS << " "; + if (MI->definesRegister(Reg, TRI)) + OS << "def "; + if (MI->readsRegister(Reg, TRI)) + OS << "use "; + OS << printLoc(MI->getParent(), SI) << ": " << *MI; + } + }; + + OS << "\n*** Register pressure info (" << RegName << "s) for " << MF.getName() + << " ***\n"; + OS << "Max pressure is " << MaxNumRegs << ' ' << RegName << "s at " + << printLoc(MaxPressureMI->getParent(), MaxPressureSlot) << ": " + << *MaxPressureMI; + + OS << "\nLive registers with single definition (" << SDefNumRegs << ' ' + << RegName << "s):\n"; + + // Sort SDefRegs by number of uses (smallest first) + llvm::sort(SDefRegs, [&](Register A, Register B) { + return std::distance(MRI.use_nodbg_begin(A), MRI.use_nodbg_end()) < + std::distance(MRI.use_nodbg_begin(B), MRI.use_nodbg_end()); + }); + + for (const Register Reg : SDefRegs) { + PrintRegInfo(Reg, LiveSet->lookup(Reg)); + } + + OS << "\nLive registers with multiple definitions (" << MDefNumRegs << ' ' + << RegName << "s):\n"; + for (const Register Reg : MDefRegs) { + PrintRegInfo(Reg, LiveSet->lookup(Reg)); + } +} +#endif diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 898d1ff..979a8b0 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -31,6 +31,12 @@ class 
SlotIndex; struct GCNRegPressure { enum RegKind { SGPR, VGPR, AGPR, AVGPR, TOTAL_KINDS }; + static constexpr const char *getName(RegKind Kind) { + const char *Names[] = {"SGPR", "VGPR", "AGPR", "AVGPR"}; + assert(Kind < TOTAL_KINDS); + return Names[Kind]; + } + GCNRegPressure() { clear(); } @@ -41,6 +47,11 @@ struct GCNRegPressure { void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); } + unsigned getNumRegs(RegKind Kind) const { + assert(Kind < TOTAL_KINDS); + return Value[Kind]; + } + /// \returns the SGPR32 pressure unsigned getSGPRNum() const { return Value[SGPR]; } /// \returns the aggregated ArchVGPR32, AccVGPR32, and Pseudo AVGPR pressure @@ -138,6 +149,12 @@ struct GCNRegPressure { void dump() const; + static RegKind getRegKind(unsigned Reg, const MachineRegisterInfo &MRI) { + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + const SIRegisterInfo *STI = static_cast<const SIRegisterInfo *>(TRI); + return (RegKind)getRegKind(MRI.getRegClass(Reg), STI); + } + private: static constexpr unsigned ValueArraySize = TOTAL_KINDS * 2; @@ -294,8 +311,10 @@ public: } }; -GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS, - const MachineRegisterInfo &MRI); +GCNRPTracker::LiveRegSet +getLiveRegs(SlotIndex SI, const LiveIntervals &LIS, + const MachineRegisterInfo &MRI, + GCNRegPressure::RegKind RegKind = GCNRegPressure::TOTAL_KINDS); //////////////////////////////////////////////////////////////////////////////// // GCNUpwardRPTracker @@ -428,9 +447,6 @@ LaneBitmask getLiveLaneMask(const LiveInterval &LI, SlotIndex SI, const MachineRegisterInfo &MRI, LaneBitmask LaneMaskFilter = LaneBitmask::getAll()); -GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS, - const MachineRegisterInfo &MRI); - /// creates a map MachineInstr -> LiveRegSet /// R - range of iterators on instructions /// After - upon entry or exit of every instruction @@ -524,6 +540,11 @@ public: } }; +LLVM_ABI void 
dumpMaxRegPressure(MachineFunction &MF, + GCNRegPressure::RegKind Kind, + LiveIntervals &LIS, + const MachineLoopInfo *MLI); + } // end namespace llvm #endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index bdc0810..58482ea 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -69,6 +69,21 @@ static cl::opt<bool> GCNTrackers( cl::desc("Use the AMDGPU specific RPTrackers during scheduling"), cl::init(false)); +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +#define DUMP_MAX_REG_PRESSURE +static cl::opt<bool> PrintMaxRPRegUsageBeforeScheduler( + "amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden, + cl::desc("Print a list of live registers along with their def/uses at the " + "point of maximum register pressure before scheduling."), + cl::init(false)); + +static cl::opt<bool> PrintMaxRPRegUsageAfterScheduler( + "amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden, + cl::desc("Print a list of live registers along with their def/uses at the " + "point of maximum register pressure after scheduling."), + cl::init(false)); +#endif + const unsigned ScheduleMetrics::ScaleFactor = 100; GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) @@ -960,6 +975,14 @@ void GCNScheduleDAGMILive::runSchedStages() { RegionLiveOuts.buildLiveRegMap(); } +#ifdef DUMP_MAX_REG_PRESSURE + if (PrintMaxRPRegUsageBeforeScheduler) { + dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI); + dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI); + LIS->dump(); + } +#endif + GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl); while (S.advanceStage()) { auto Stage = createSchedStage(S.getCurrentStage()); @@ -995,6 +1018,14 @@ void GCNScheduleDAGMILive::runSchedStages() { Stage->finalizeGCNSchedStage(); } + +#ifdef DUMP_MAX_REG_PRESSURE + if (PrintMaxRPRegUsageAfterScheduler) { 
+ dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI); + dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI); + LIS->dump(); + } +#endif } #ifndef NDEBUG |