diff options
author | Austin Kerbow <Austin.Kerbow@amd.com> | 2025-02-25 21:46:05 -0800 |
---|---|---|
committer | Austin Kerbow <Austin.Kerbow@amd.com> | 2025-02-26 09:03:57 -0800 |
commit | d4c4f79a4ade5a49ab7ffbfb067c1d8bf4515add (patch) | |
tree | d5851e264a8ccd50fff59b801eda69117486977a | |
parent | fe6782da986b184d334b222a2adfefa4aa4b4bb4 (diff) | |
download | llvm-users/kerbowa/amdgpu-load-lat-scale.zip llvm-users/kerbowa/amdgpu-load-lat-scale.tar.gz llvm-users/kerbowa/amdgpu-load-lat-scale.tar.bz2 |
[AMDGPU] Dynamically set load latency in the scheduler (branch: users/kerbowa/amdgpu-load-lat-scale)
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 26 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 5 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 19 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.h | 12 |
4 files changed, 60 insertions, 2 deletions
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 176586e..faaa8b5 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1130,6 +1130,32 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() { GCNSchedStage::finalizeGCNSchedStage(); } +bool ILPInitialScheduleStage::initGCNSchedStage() { + if (!GCNSchedStage::initGCNSchedStage()) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + OriginalLoadLatencyScaleFactor = TII->getLoadLatencyScaleFactor(); + const unsigned ILPLoadLatencyScaleFactorDefault = 300; + if (ILPLoadLatencyScaleFactorDefault > TII->getLoadLatencyScaleFactor()) + TII->setLoadLatencyScaleFactor(ILPLoadLatencyScaleFactorDefault); + + LLVM_DEBUG(dbgs() << "ILP Initial Schedule: Set load latency scale factor to " + << TII->getLoadLatencyScaleFactor() << '\n'); + return true; +} + +void ILPInitialScheduleStage::finalizeGCNSchedStage() { + const SIInstrInfo *TII = ST.getInstrInfo(); + TII->setLoadLatencyScaleFactor(OriginalLoadLatencyScaleFactor); + + LLVM_DEBUG( + dbgs() << "ILP Initial Schedule: Restored load latency scale factor to " + << OriginalLoadLatencyScaleFactor << "\n"); + + GCNSchedStage::finalizeGCNSchedStage(); +} + bool GCNSchedStage::initGCNRegion() { // Check whether this new region is also a new block. 
if (DAG.RegionBegin->getParent() != CurrentMBB) diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index e3da8d3..b2e8cda 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -474,8 +474,13 @@ public: }; class ILPInitialScheduleStage : public GCNSchedStage { +private: + unsigned OriginalLoadLatencyScaleFactor = 0; + public: bool shouldRevertScheduling(unsigned WavesAfter) override; + bool initGCNSchedStage() override; + void finalizeGCNSchedStage() override; ILPInitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) : GCNSchedStage(StageID, DAG) {} diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d5d5433..502c8e1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -60,9 +60,15 @@ static cl::opt<bool> Fix16BitCopies( cl::init(true), cl::ReallyHidden); +static cl::opt<unsigned> AMDGPULoadLatencyScaleFactor( + "amdgpu-load-latency-scale-factor", + cl::desc("Scale factor for load instruction latency. 
Final latency is " + "scalled by `Factor / 100 * Latency`."), + cl::init(100), cl::ReallyHidden); + SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) - : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), - RI(ST), ST(ST) { + : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), + RI(ST), ST(ST), LoadLatencyScaleFactor(AMDGPULoadLatencyScaleFactor) { SchedModel.init(&ST); } @@ -9792,6 +9798,15 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, return SchedModel.computeInstrLatency(&MI); } +unsigned SIInstrInfo::getInstrLatency(const TargetSchedModel &TargetSchedModel, + const MachineInstr &MI) const { + unsigned Latency = TargetInstrInfo::getInstrLatency(TargetSchedModel, MI); + if (MI.mayLoad()) + Latency *= LoadLatencyScaleFactor / 100; + + return Latency; +} + InstructionUniformity SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const { unsigned opcode = MI.getOpcode(); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 428322a..ef8550cb 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -88,6 +88,9 @@ private: const GCNSubtarget &ST; TargetSchedModel SchedModel; mutable std::unique_ptr<AMDGPUMIRFormatter> Formatter; + // Final load latency in the machine model is scalled by + // `Factor / 100 * Latency` + mutable unsigned LoadLatencyScaleFactor = 100; // The inverse predicate should have the negative value. 
enum BranchPredicate { @@ -106,6 +109,12 @@ private: static BranchPredicate getBranchPredicate(unsigned Opcode); public: + void setLoadLatencyScaleFactor(unsigned Factor) const { + LoadLatencyScaleFactor = Factor; + } + + unsigned getLoadLatencyScaleFactor() const { return LoadLatencyScaleFactor; } + unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, @@ -1462,6 +1471,9 @@ public: const MachineInstr &MI, unsigned *PredCost = nullptr) const override; + unsigned getInstrLatency(const TargetSchedModel &TargetSchedModel, + const MachineInstr &MI) const override; + InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const override final; |