author     Austin Kerbow <Austin.Kerbow@amd.com>  2025-02-25 21:46:05 -0800
committer  Austin Kerbow <Austin.Kerbow@amd.com>  2025-02-26 09:03:57 -0800
commit     d4c4f79a4ade5a49ab7ffbfb067c1d8bf4515add (patch)
tree       d5851e264a8ccd50fff59b801eda69117486977a
parent     fe6782da986b184d334b222a2adfefa4aa4b4bb4 (diff)
[AMDGPU] Dynamically set load latency in the scheduler (users/kerbowa/amdgpu-load-lat-scale)
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 26 ++++++++++++++++++++++
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h   |  5 +++++
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp      | 19 +++++++++++++----
 llvm/lib/Target/AMDGPU/SIInstrInfo.h        | 12 ++++++++++++
 4 files changed, 60 insertions(+), 2 deletions(-)
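The patch scales the latency the scheduler sees for load instructions by a
configurable percentage: the effective latency is `Latency * Factor / 100`,
with 100 (no scaling) as the global default and 300 as the floor applied
during the ILP initial schedule stage. A minimal standalone sketch of that
arithmetic (the function name is illustrative, not part of the patch):

    #include <cassert>

    // Multiply before dividing so factors that are not multiples of 100
    // (e.g. 150) do not truncate to a no-op.
    unsigned scaleLoadLatency(unsigned Latency, unsigned Factor) {
      return Latency * Factor / 100;
    }

    int main() {
      assert(scaleLoadLatency(20, 100) == 20); // default: latency unchanged
      assert(scaleLoadLatency(20, 300) == 60); // ILP initial schedule default
      assert(scaleLoadLatency(20, 150) == 30); // dividing first would give 20
    }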
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 176586e..faaa8b5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1130,6 +1130,32 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() {
GCNSchedStage::finalizeGCNSchedStage();
}
+bool ILPInitialScheduleStage::initGCNSchedStage() {
+ if (!GCNSchedStage::initGCNSchedStage())
+ return false;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ OriginalLoadLatencyScaleFactor = TII->getLoadLatencyScaleFactor();
+ const unsigned ILPLoadLatencyScaleFactorDefault = 300;
+ if (ILPLoadLatencyScaleFactorDefault > TII->getLoadLatencyScaleFactor())
+ TII->setLoadLatencyScaleFactor(ILPLoadLatencyScaleFactorDefault);
+
+ LLVM_DEBUG(dbgs() << "ILP Initial Schedule: Set load latency scale factor to "
+ << TII->getLoadLatencyScaleFactor() << '\n');
+ return true;
+}
+
+void ILPInitialScheduleStage::finalizeGCNSchedStage() {
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ TII->setLoadLatencyScaleFactor(OriginalLoadLatencyScaleFactor);
+
+ LLVM_DEBUG(
+ dbgs() << "ILP Initial Schedule: Restored load latency scale factor to "
+ << OriginalLoadLatencyScaleFactor << "\n");
+
+ GCNSchedStage::finalizeGCNSchedStage();
+}
+
bool GCNSchedStage::initGCNRegion() {
// Check whether this new region is also a new block.
if (DAG.RegionBegin->getParent() != CurrentMBB)
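Note that initGCNSchedStage above only raises the scale factor, never lowers
it: a user-supplied factor above 300 wins over the stage default, and
finalizeGCNSchedStage restores whatever value was in effect before the stage
ran. A standalone sketch of that policy (the function name is illustrative):

    #include <algorithm>

    // Mirrors the stage's "raise but never lower" policy: take the larger of
    // the current factor and the stage default.
    unsigned ilpStageFactor(unsigned Current) {
      const unsigned ILPLoadLatencyScaleFactorDefault = 300;
      return std::max(Current, ILPLoadLatencyScaleFactorDefault);
    }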
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index e3da8d3..b2e8cda 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -474,8 +474,13 @@ public:
};
class ILPInitialScheduleStage : public GCNSchedStage {
+private:
+ unsigned OriginalLoadLatencyScaleFactor = 0;
+
public:
bool shouldRevertScheduling(unsigned WavesAfter) override;
+ bool initGCNSchedStage() override;
+ void finalizeGCNSchedStage() override;
ILPInitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
: GCNSchedStage(StageID, DAG) {}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d5d5433..502c8e1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -60,9 +60,15 @@ static cl::opt<bool> Fix16BitCopies(
cl::init(true),
cl::ReallyHidden);
+static cl::opt<unsigned> AMDGPULoadLatencyScaleFactor(
+ "amdgpu-load-latency-scale-factor",
+ cl::desc("Scale factor for load instruction latency. Final latency is "
+ "scalled by `Factor / 100 * Latency`."),
+ cl::init(100), cl::ReallyHidden);
+
SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
-    : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
-      RI(ST), ST(ST) {
+    : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
+      RI(ST), ST(ST), LoadLatencyScaleFactor(AMDGPULoadLatencyScaleFactor) {
SchedModel.init(&ST);
}
@@ -9792,6 +9798,15 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
return SchedModel.computeInstrLatency(&MI);
}
+unsigned SIInstrInfo::getInstrLatency(const TargetSchedModel &TargetSchedModel,
+ const MachineInstr &MI) const {
+ unsigned Latency = TargetInstrInfo::getInstrLatency(TargetSchedModel, MI);
+ if (MI.mayLoad())
+    Latency = Latency * LoadLatencyScaleFactor / 100;
+
+ return Latency;
+}
+
InstructionUniformity
SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
unsigned opcode = MI.getOpcode();
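Because the new cl::opt is ReallyHidden, it does not appear in --help output
but can still be passed explicitly. A hypothetical invocation (the triple,
CPU, and file names are placeholders, not from the patch):

    llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-load-latency-scale-factor=200 kernel.ll -o kernel.s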
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 428322a..ef8550cb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -88,6 +88,9 @@ private:
const GCNSubtarget &ST;
TargetSchedModel SchedModel;
mutable std::unique_ptr<AMDGPUMIRFormatter> Formatter;
+  // Final load latency in the machine model is scaled by
+  // `Latency * Factor / 100`.
+ mutable unsigned LoadLatencyScaleFactor = 100;
// The inverse predicate should have the negative value.
enum BranchPredicate {
@@ -106,6 +109,12 @@ private:
static BranchPredicate getBranchPredicate(unsigned Opcode);
public:
+ void setLoadLatencyScaleFactor(unsigned Factor) const {
+ LoadLatencyScaleFactor = Factor;
+ }
+
+ unsigned getLoadLatencyScaleFactor() const { return LoadLatencyScaleFactor; }
+
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI,
MachineRegisterInfo &MRI,
const MachineOperand &SuperReg,
@@ -1462,6 +1471,9 @@ public:
const MachineInstr &MI,
unsigned *PredCost = nullptr) const override;
+ unsigned getInstrLatency(const TargetSchedModel &TargetSchedModel,
+ const MachineInstr &MI) const override;
+
InstructionUniformity
getInstructionUniformity(const MachineInstr &MI) const override final;
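The setter below is const yet mutates state: scheduler stages only hold a
const SIInstrInfo pointer, so the factor is stored as a mutable member, like
the Formatter member above. A minimal sketch of that pattern (the class and
member names are illustrative):

    // A const method may write to a mutable member, which is what lets the
    // scheduler adjust the factor through its const SIInstrInfo pointer.
    class ScaleHolder {
      mutable unsigned Factor = 100;
    public:
      void setFactor(unsigned F) const { Factor = F; }
      unsigned getFactor() const { return Factor; }
    };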