author	Austin Kerbow <Austin.Kerbow@amd.com>	2025-02-25 21:46:05 -0800
committer	Austin Kerbow <Austin.Kerbow@amd.com>	2025-09-15 22:51:53 -0700
commit	d57286a6889d75d7dbc2cfebb55b45ee4268f018 (patch)
tree	8902c7ed3c8c90f09ea0fd18deb2754b4be400d4
parent	450737fba25203c8dfeca036925e9af9d6c22f0a (diff)
[AMDGPU] Dynamically set load latency in the scheduler (users/kerbowa/amdgpu-load-lat-scale)
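
Add hidden cl::opt scale factors that let the machine scheduler model
longer or shorter load latencies: amdgpu-load-latency-scale-factor,
amdgpu-ds-read-latency-scale-factor, and
amdgpu-vmem-load-latency-scale-factor. Each scales the latency computed
for a load to `Latency * Factor / 100`. The ILP initial schedule stage
temporarily raises all three factors to at least 300 and restores the
original values when the stage finishes.

A minimal sketch of how the flags could be exercised with llc (input.ll
is a placeholder module; the factor values are illustrative, not tuned
recommendations):

    llc -mtriple=amdgcn -mcpu=gfx908 \
        -amdgpu-load-latency-scale-factor=150 \
        -amdgpu-ds-read-latency-scale-factor=100 \
        -amdgpu-vmem-load-latency-scale-factor=200 \
        input.ll -o output.s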
-rw-r--r--	llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp	34
-rw-r--r--	llvm/lib/Target/AMDGPU/GCNSchedStrategy.h	7
-rw-r--r--	llvm/lib/Target/AMDGPU/SIInstrInfo.cpp	59
-rw-r--r--	llvm/lib/Target/AMDGPU/SIInstrInfo.h	39
-rw-r--r--	llvm/test/CodeGen/AMDGPU/res	793
5 files changed, 931 insertions, 1 deletion
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 254b75b..ae553da 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1150,6 +1150,40 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() {
GCNSchedStage::finalizeGCNSchedStage();
}
+bool ILPInitialScheduleStage::initGCNSchedStage() {
+ if (!GCNSchedStage::initGCNSchedStage())
+ return false;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ OriginalLoadLatencyScaleFactor = TII->getLoadLatencyScaleFactor();
+ OriginalDSReadLatencyScaleFactor = TII->getDSReadLatencyScaleFactor();
+ OriginalVMEMLoadLatencyScaleFactor = TII->getVMEMLoadLatencyScaleFactor();
+ const unsigned ILPLoadLatencyScaleFactorDefault = 300;
+ if (ILPLoadLatencyScaleFactorDefault > TII->getLoadLatencyScaleFactor())
+ TII->setLoadLatencyScaleFactor(ILPLoadLatencyScaleFactorDefault);
+ if (ILPLoadLatencyScaleFactorDefault > TII->getDSReadLatencyScaleFactor())
+ TII->setDSReadLatencyScaleFactor(ILPLoadLatencyScaleFactorDefault);
+ if (ILPLoadLatencyScaleFactorDefault > TII->getVMEMLoadLatencyScaleFactor())
+ TII->setVMEMLoadLatencyScaleFactor(ILPLoadLatencyScaleFactorDefault);
+
+ LLVM_DEBUG(dbgs() << "ILP Initial Schedule: Set load latency scale factor to "
+ << TII->getLoadLatencyScaleFactor() << '\n');
+ return true;
+}
+
+void ILPInitialScheduleStage::finalizeGCNSchedStage() {
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ TII->setLoadLatencyScaleFactor(OriginalLoadLatencyScaleFactor);
+ TII->setDSReadLatencyScaleFactor(OriginalDSReadLatencyScaleFactor);
+ TII->setVMEMLoadLatencyScaleFactor(OriginalVMEMLoadLatencyScaleFactor);
+
+ LLVM_DEBUG(
+ dbgs() << "ILP Initial Schedule: Restored load latency scale factor to "
+ << OriginalLoadLatencyScaleFactor << "\n");
+
+ GCNSchedStage::finalizeGCNSchedStage();
+}
+
bool GCNSchedStage::initGCNRegion() {
// Check whether this new region is also a new block.
if (DAG.RegionBegin->getParent() != CurrentMBB)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 790370f..5be6d4b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -515,8 +515,15 @@ public:
};
class ILPInitialScheduleStage : public GCNSchedStage {
+private:
+ unsigned OriginalLoadLatencyScaleFactor = 0;
+ unsigned OriginalDSReadLatencyScaleFactor = 0;
+ unsigned OriginalVMEMLoadLatencyScaleFactor = 0;
+
public:
bool shouldRevertScheduling(unsigned WavesAfter) override;
+ bool initGCNSchedStage() override;
+ void finalizeGCNSchedStage() override;
ILPInitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
: GCNSchedStage(StageID, DAG) {}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 5106478..a35aabd 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -62,9 +62,29 @@ static cl::opt<bool> Fix16BitCopies(
cl::init(true),
cl::ReallyHidden);
+static cl::opt<unsigned> AMDGPULoadLatencyScaleFactor(
+ "amdgpu-load-latency-scale-factor",
+ cl::desc("Scale factor for load instruction latency. Final latency is "
+ "scalled by `Factor / 100 * Latency`."),
+ cl::init(100), cl::ReallyHidden);
+
+static cl::opt<unsigned> AMDGPUDSReadLatencyScaleFactor(
+ "amdgpu-ds-read-latency-scale-factor",
+ cl::desc("Scale factor for LDS (DS) read instruction latency. Final "
+ "latency is scaled by `Factor / 100 * Latency`."),
+ cl::init(100), cl::ReallyHidden);
+
+static cl::opt<unsigned> AMDGPUVMEMLoadLatencyScaleFactor(
+ "amdgpu-vmem-load-latency-scale-factor",
+ cl::desc("Scale factor for VMEM/BUFFER/FLAT load instruction latency. "
+ "Final latency is scaled by `Factor / 100 * Latency`."),
+ cl::init(100), cl::ReallyHidden);
+
SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
: AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
- RI(ST), ST(ST) {
+ RI(ST), ST(ST), LoadLatencyScaleFactor(AMDGPULoadLatencyScaleFactor),
+ DSReadLatencyScaleFactor(AMDGPUDSReadLatencyScaleFactor),
+ VMEMLoadLatencyScaleFactor(AMDGPUVMEMLoadLatencyScaleFactor) {
SchedModel.init(&ST);
}
@@ -10240,6 +10260,43 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
return SchedModel.computeInstrLatency(&MI);
}
+std::optional<unsigned>
+SIInstrInfo::getInstrLatency(const TargetSchedModel &TargetSchedModel,
+ const MachineInstr &MI) const {
+ auto LatencyOpt = TargetInstrInfo::getInstrLatency(TargetSchedModel, MI);
+ if (!LatencyOpt)
+ return std::nullopt;
+ unsigned Latency = *LatencyOpt;
+ if (MI.mayLoad()) {
+ unsigned Scale = LoadLatencyScaleFactor;
+ if (isDS(MI))
+ Scale = DSReadLatencyScaleFactor;
+ else if (isVMEM(MI) || isFLAT(MI))
+ Scale = VMEMLoadLatencyScaleFactor;
+ Latency = (Latency * Scale) / 100;
+ }
+ return Latency;
+}
+
+std::optional<unsigned> SIInstrInfo::getOperandLatency(
+ const TargetSchedModel &SchedModel, const MachineInstr *DefMI,
+ unsigned DefOperIdx, const MachineInstr *UseMI, unsigned UseOperIdx) const {
+ auto LatOpt = TargetInstrInfo::getOperandLatency(
+ SchedModel, DefMI, DefOperIdx, UseMI, UseOperIdx);
+ if (!LatOpt)
+ return std::nullopt;
+ unsigned Latency = *LatOpt;
+ if (DefMI && DefMI->mayLoad()) {
+ unsigned Scale = LoadLatencyScaleFactor;
+ if (isDS(*DefMI))
+ Scale = DSReadLatencyScaleFactor;
+ else if (isVMEM(*DefMI) || isFLAT(*DefMI))
+ Scale = VMEMLoadLatencyScaleFactor;
+ Latency = (Latency * Scale) / 100;
+ }
+ return Latency;
+}
+
InstructionUniformity
SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index dffb3d7..e01e303 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -93,6 +93,13 @@ private:
const GCNSubtarget &ST;
TargetSchedModel SchedModel;
mutable std::unique_ptr<AMDGPUMIRFormatter> Formatter;
+ // Final load latency in the machine model is computed as
+ // `Latency * Factor / 100`.
+ mutable unsigned LoadLatencyScaleFactor = 100;
+ // Separate scale factor for LDS (DS) read operations.
+ mutable unsigned DSReadLatencyScaleFactor = 100;
+ // Separate scale factor for VMEM/BUFFER/FLAT loads.
+ mutable unsigned VMEMLoadLatencyScaleFactor = 100;
// The inverse predicate should have the negative value.
enum BranchPredicate {
@@ -111,6 +118,38 @@ private:
static BranchPredicate getBranchPredicate(unsigned Opcode);
public:
+ void setLoadLatencyScaleFactor(unsigned Factor) const {
+ LoadLatencyScaleFactor = Factor;
+ }
+
+ unsigned getLoadLatencyScaleFactor() const { return LoadLatencyScaleFactor; }
+
+ // Control DS read (LDS) latency scaling independently when desired.
+ void setDSReadLatencyScaleFactor(unsigned Factor) const {
+ DSReadLatencyScaleFactor = Factor;
+ }
+ unsigned getDSReadLatencyScaleFactor() const {
+ return DSReadLatencyScaleFactor;
+ }
+
+ // Control VMEM/BUFFER/FLAT load latency scaling independently.
+ void setVMEMLoadLatencyScaleFactor(unsigned Factor) const {
+ VMEMLoadLatencyScaleFactor = Factor;
+ }
+ unsigned getVMEMLoadLatencyScaleFactor() const {
+ return VMEMLoadLatencyScaleFactor;
+ }
+
+ // TargetSchedModel latency hooks.
+ std::optional<unsigned>
+ getInstrLatency(const TargetSchedModel &TargetSchedModel,
+ const MachineInstr &MI) const override;
+ std::optional<unsigned> getOperandLatency(const TargetSchedModel &SchedModel,
+ const MachineInstr *DefMI,
+ unsigned DefIdx,
+ const MachineInstr *UseMI,
+ unsigned UseIdx) const override;
+
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI,
MachineRegisterInfo &MRI,
const MachineOperand &SuperReg,
diff --git a/llvm/test/CodeGen/AMDGPU/res b/llvm/test/CodeGen/AMDGPU/res
new file mode 100644
index 0000000..a190b03
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/res
@@ -0,0 +1,793 @@
+ .section .AMDGPU.config,"",@progbits
+ .long 47176
+ .long 11468864
+ .long 47180
+ .long 5008
+ .long 47200
+ .long 0
+ .long 4
+ .long 0
+ .long 8
+ .long 0
+ .text
+ .globl s_add_i32 ; -- Begin function s_add_i32
+ .p2align 8
+ .type s_add_i32,@function
+s_add_i32: ; @s_add_i32
+; %bb.0:
+ s_load_dwordx4 s[0:3], s[4:5], 0x24
+ v_mov_b32_e32 v0, 0
+ s_waitcnt lgkmcnt(0)
+ s_load_dwordx2 s[4:5], s[2:3], 0x0
+ s_waitcnt lgkmcnt(0)
+ s_add_i32 s2, s4, s5
+ v_mov_b32_e32 v1, s2
+ global_store_dword v0, v1, s[0:1]
+ s_endpgm
+.Lfunc_end0:
+ .size s_add_i32, .Lfunc_end0-s_add_i32
+ ; -- End function
+ .set s_add_i32.num_vgpr, 2
+ .set s_add_i32.num_agpr, 0
+ .set s_add_i32.numbered_sgpr, 6
+ .set s_add_i32.num_named_barrier, 0
+ .set s_add_i32.private_seg_size, 0
+ .set s_add_i32.uses_vcc, 0
+ .set s_add_i32.uses_flat_scratch, 0
+ .set s_add_i32.has_dyn_sized_stack, 0
+ .set s_add_i32.has_recursion, 0
+ .set s_add_i32.has_indirect_call, 0
+ .section .AMDGPU.csdata,"",@progbits
+; Kernel info:
+; codeLenInByte = 48
+; TotalNumSgprs: 10
+; NumVgprs: 2
+; NumAgprs: 0
+; TotalNumVgprs: 2
+; ScratchSize: 0
+; MemoryBound: 0
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 1
+; VGPRBlocks: 0
+; NumSGPRsForWavesPerEU: 10
+; NumVGPRsForWavesPerEU: 2
+; Occupancy: 10
+; WaveLimiterHint : 0
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 2
+ .section .AMDGPU.config,"",@progbits
+ .long 47176
+ .long 11468864
+ .long 47180
+ .long 5008
+ .long 47200
+ .long 0
+ .long 4
+ .long 0
+ .long 8
+ .long 0
+ .text
+ .globl s_add_v2i32 ; -- Begin function s_add_v2i32
+ .p2align 8
+ .type s_add_v2i32,@function
+s_add_v2i32: ; @s_add_v2i32
+; %bb.0:
+ s_load_dwordx4 s[0:3], s[4:5], 0x24
+ v_mov_b32_e32 v2, 0
+ s_waitcnt lgkmcnt(0)
+ s_load_dwordx4 s[4:7], s[2:3], 0x0
+ s_waitcnt lgkmcnt(0)
+ s_add_i32 s2, s5, s7
+ s_add_i32 s3, s4, s6
+ v_mov_b32_e32 v0, s3
+ v_mov_b32_e32 v1, s2
+ global_store_dwordx2 v2, v[0:1], s[0:1]
+ s_endpgm
+.Lfunc_end1:
+ .size s_add_v2i32, .Lfunc_end1-s_add_v2i32
+ ; -- End function
+ .set s_add_v2i32.num_vgpr, 3
+ .set s_add_v2i32.num_agpr, 0
+ .set s_add_v2i32.numbered_sgpr, 8
+ .set s_add_v2i32.num_named_barrier, 0
+ .set s_add_v2i32.private_seg_size, 0
+ .set s_add_v2i32.uses_vcc, 0
+ .set s_add_v2i32.uses_flat_scratch, 0
+ .set s_add_v2i32.has_dyn_sized_stack, 0
+ .set s_add_v2i32.has_recursion, 0
+ .set s_add_v2i32.has_indirect_call, 0
+ .section .AMDGPU.csdata,"",@progbits
+; Kernel info:
+; codeLenInByte = 56
+; TotalNumSgprs: 12
+; NumVgprs: 3
+; NumAgprs: 0
+; TotalNumVgprs: 3
+; ScratchSize: 0
+; MemoryBound: 1
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 1
+; VGPRBlocks: 0
+; NumSGPRsForWavesPerEU: 12
+; NumVGPRsForWavesPerEU: 3
+; Occupancy: 10
+; WaveLimiterHint : 1
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 2
+ .section .AMDGPU.config,"",@progbits
+ .long 47176
+ .long 11468865
+ .long 47180
+ .long 5008
+ .long 47200
+ .long 0
+ .long 4
+ .long 0
+ .long 8
+ .long 0
+ .text
+ .globl s_add_v4i32 ; -- Begin function s_add_v4i32
+ .p2align 8
+ .type s_add_v4i32,@function
+s_add_v4i32: ; @s_add_v4i32
+; %bb.0:
+ s_load_dwordx4 s[8:11], s[4:5], 0x24
+ v_mov_b32_e32 v4, 0
+ s_waitcnt lgkmcnt(0)
+ s_load_dwordx8 s[0:7], s[10:11], 0x0
+ s_waitcnt lgkmcnt(0)
+ s_add_i32 s3, s3, s7
+ s_add_i32 s2, s2, s6
+ s_add_i32 s1, s1, s5
+ s_add_i32 s0, s0, s4
+ v_mov_b32_e32 v0, s0
+ v_mov_b32_e32 v1, s1
+ v_mov_b32_e32 v2, s2
+ v_mov_b32_e32 v3, s3
+ global_store_dwordx4 v4, v[0:3], s[8:9]
+ s_endpgm
+.Lfunc_end2:
+ .size s_add_v4i32, .Lfunc_end2-s_add_v4i32
+ ; -- End function
+ .set s_add_v4i32.num_vgpr, 5
+ .set s_add_v4i32.num_agpr, 0
+ .set s_add_v4i32.numbered_sgpr, 12
+ .set s_add_v4i32.num_named_barrier, 0
+ .set s_add_v4i32.private_seg_size, 0
+ .set s_add_v4i32.uses_vcc, 0
+ .set s_add_v4i32.uses_flat_scratch, 0
+ .set s_add_v4i32.has_dyn_sized_stack, 0
+ .set s_add_v4i32.has_recursion, 0
+ .set s_add_v4i32.has_indirect_call, 0
+ .section .AMDGPU.csdata,"",@progbits
+; Kernel info:
+; codeLenInByte = 72
+; TotalNumSgprs: 16
+; NumVgprs: 5
+; NumAgprs: 0
+; TotalNumVgprs: 5
+; ScratchSize: 0
+; MemoryBound: 1
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 1
+; VGPRBlocks: 1
+; NumSGPRsForWavesPerEU: 16
+; NumVGPRsForWavesPerEU: 5
+; Occupancy: 10
+; WaveLimiterHint : 1
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 2
+ .section .AMDGPU.config,"",@progbits
+ .long 47176
+ .long 11468993
+ .long 47180
+ .long 5008
+ .long 47200
+ .long 0
+ .long 4
+ .long 0
+ .long 8
+ .long 0
+ .text
+ .globl s_add_v8i32 ; -- Begin function s_add_v8i32
+ .p2align 8
+ .type s_add_v8i32,@function
+s_add_v8i32: ; @s_add_v8i32
+; %bb.0: ; %entry
+ s_load_dwordx16 s[8:23], s[4:5], 0x44
+ s_load_dwordx2 s[0:1], s[4:5], 0x24
+ v_mov_b32_e32 v4, 0
+ s_waitcnt lgkmcnt(0)
+ s_add_i32 s4, s9, s17
+ s_add_i32 s5, s8, s16
+ s_add_i32 s6, s15, s23
+ s_add_i32 s7, s14, s22
+ s_add_i32 s8, s13, s21
+ s_add_i32 s9, s12, s20
+ s_add_i32 s2, s11, s19
+ s_add_i32 s3, s10, s18
+ v_mov_b32_e32 v0, s9
+ v_mov_b32_e32 v1, s8
+ v_mov_b32_e32 v2, s7
+ v_mov_b32_e32 v3, s6
+ global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+ s_nop 0
+ v_mov_b32_e32 v0, s5
+ v_mov_b32_e32 v1, s4
+ v_mov_b32_e32 v2, s3
+ v_mov_b32_e32 v3, s2
+ global_store_dwordx4 v4, v[0:3], s[0:1]
+ s_endpgm
+.Lfunc_end3:
+ .size s_add_v8i32, .Lfunc_end3-s_add_v8i32
+ ; -- End function
+ .set s_add_v8i32.num_vgpr, 5
+ .set s_add_v8i32.num_agpr, 0
+ .set s_add_v8i32.numbered_sgpr, 24
+ .set s_add_v8i32.num_named_barrier, 0
+ .set s_add_v8i32.private_seg_size, 0
+ .set s_add_v8i32.uses_vcc, 0
+ .set s_add_v8i32.uses_flat_scratch, 0
+ .set s_add_v8i32.has_dyn_sized_stack, 0
+ .set s_add_v8i32.has_recursion, 0
+ .set s_add_v8i32.has_indirect_call, 0
+ .section .AMDGPU.csdata,"",@progbits
+; Kernel info:
+; codeLenInByte = 112
+; TotalNumSgprs: 28
+; NumVgprs: 5
+; NumAgprs: 0
+; TotalNumVgprs: 5
+; ScratchSize: 0
+; MemoryBound: 1
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 3
+; VGPRBlocks: 1
+; NumSGPRsForWavesPerEU: 28
+; NumVGPRsForWavesPerEU: 5
+; Occupancy: 10
+; WaveLimiterHint : 1
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 2
+ .section .AMDGPU.config,"",@progbits
+ .long 47176
+ .long 11469185
+ .long 47180
+ .long 5008
+ .long 47200
+ .long 0
+ .long 4
+ .long 0
+ .long 8
+ .long 0
+ .text
+ .globl s_add_v16i32 ; -- Begin function s_add_v16i32
+ .p2align 8
+ .type s_add_v16i32,@function
+s_add_v16i32: ; @s_add_v16i32
+; %bb.0: ; %entry
+ s_load_dwordx16 s[8:23], s[4:5], 0x64
+ s_load_dwordx16 s[36:51], s[4:5], 0xa4
+ s_load_dwordx2 s[0:1], s[4:5], 0x24
+ v_mov_b32_e32 v4, 0
+ s_waitcnt lgkmcnt(0)
+ s_add_i32 s4, s9, s37
+ s_add_i32 s5, s8, s36
+ s_add_i32 s6, s15, s43
+ s_add_i32 s7, s14, s42
+ s_add_i32 s8, s13, s41
+ s_add_i32 s9, s12, s40
+ s_add_i32 s12, s17, s45
+ s_add_i32 s13, s16, s44
+ s_add_i32 s14, s23, s51
+ s_add_i32 s15, s22, s50
+ s_add_i32 s16, s21, s49
+ s_add_i32 s17, s20, s48
+ s_add_i32 s2, s11, s39
+ s_add_i32 s3, s10, s38
+ s_add_i32 s10, s19, s47
+ s_add_i32 s11, s18, s46
+ v_mov_b32_e32 v0, s17
+ v_mov_b32_e32 v1, s16
+ v_mov_b32_e32 v2, s15
+ v_mov_b32_e32 v3, s14
+ global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+ s_nop 0
+ v_mov_b32_e32 v0, s13
+ v_mov_b32_e32 v1, s12
+ v_mov_b32_e32 v2, s11
+ v_mov_b32_e32 v3, s10
+ global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+ s_nop 0
+ v_mov_b32_e32 v0, s9
+ v_mov_b32_e32 v1, s8
+ v_mov_b32_e32 v2, s7
+ v_mov_b32_e32 v3, s6
+ global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+ s_nop 0
+ v_mov_b32_e32 v0, s5
+ v_mov_b32_e32 v1, s4
+ v_mov_b32_e32 v2, s3
+ v_mov_b32_e32 v3, s2
+ global_store_dwordx4 v4, v[0:3], s[0:1]
+ s_endpgm
+.Lfunc_end4:
+ .size s_add_v16i32, .Lfunc_end4-s_add_v16i32
+ ; -- End function
+ .set s_add_v16i32.num_vgpr, 5
+ .set s_add_v16i32.num_agpr, 0
+ .set s_add_v16i32.numbered_sgpr, 52
+ .set s_add_v16i32.num_named_barrier, 0
+ .set s_add_v16i32.private_seg_size, 0
+ .set s_add_v16i32.uses_vcc, 0
+ .set s_add_v16i32.uses_flat_scratch, 0
+ .set s_add_v16i32.has_dyn_sized_stack, 0
+ .set s_add_v16i32.has_recursion, 0
+ .set s_add_v16i32.has_indirect_call, 0
+ .section .AMDGPU.csdata,"",@progbits
+; Kernel info:
+; codeLenInByte = 208
+; TotalNumSgprs: 56
+; NumVgprs: 5
+; NumAgprs: 0
+; TotalNumVgprs: 5
+; ScratchSize: 0
+; MemoryBound: 1
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 6
+; VGPRBlocks: 1
+; NumSGPRsForWavesPerEU: 56
+; NumVGPRsForWavesPerEU: 5
+; Occupancy: 10
+; WaveLimiterHint : 1
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 2
+ .section .AMDGPU.config,"",@progbits
+ .long 47176
+ .long 11468864
+ .long 47180
+ .long 5008
+ .long 47200
+ .long 0
+ .long 4
+ .long 0
+ .long 8
+ .long 0
+ .text
+ .globl v_add_i32 ; -- Begin function v_add_i32
+ .p2align 8
+ .type v_add_i32,@function
+v_add_i32: ; @v_add_i32
+; %bb.0:
+ s_load_dwordx4 s[0:3], s[4:5], 0x24
+ v_lshlrev_b32_e32 v0, 2, v0
+ s_waitcnt lgkmcnt(0)
+ global_load_dword v1, v0, s[2:3] glc
+ s_waitcnt vmcnt(0)
+ global_load_dword v2, v0, s[2:3] offset:4 glc
+ s_waitcnt vmcnt(0)
+ v_mov_b32_e32 v0, 0
+ v_add_u32_e32 v1, v1, v2
+ global_store_dword v0, v1, s[0:1]
+ s_endpgm
+.Lfunc_end5:
+ .size v_add_i32, .Lfunc_end5-v_add_i32
+ ; -- End function
+ .set v_add_i32.num_vgpr, 3
+ .set v_add_i32.num_agpr, 0
+ .set v_add_i32.numbered_sgpr, 6
+ .set v_add_i32.num_named_barrier, 0
+ .set v_add_i32.private_seg_size, 0
+ .set v_add_i32.uses_vcc, 0
+ .set v_add_i32.uses_flat_scratch, 0
+ .set v_add_i32.has_dyn_sized_stack, 0
+ .set v_add_i32.has_recursion, 0
+ .set v_add_i32.has_indirect_call, 0
+ .section .AMDGPU.csdata,"",@progbits
+; Kernel info:
+; codeLenInByte = 60
+; TotalNumSgprs: 10
+; NumVgprs: 3
+; NumAgprs: 0
+; TotalNumVgprs: 3
+; ScratchSize: 0
+; MemoryBound: 0
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 1
+; VGPRBlocks: 0
+; NumSGPRsForWavesPerEU: 10
+; NumVGPRsForWavesPerEU: 3
+; Occupancy: 10
+; WaveLimiterHint : 0
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 2
+ .section .AMDGPU.config,"",@progbits
+ .long 47176
+ .long 11468864
+ .long 47180
+ .long 5008
+ .long 47200
+ .long 0
+ .long 4
+ .long 0
+ .long 8
+ .long 0
+ .text
+ .globl v_add_imm_i32 ; -- Begin function v_add_imm_i32
+ .p2align 8
+ .type v_add_imm_i32,@function
+v_add_imm_i32: ; @v_add_imm_i32
+; %bb.0:
+ s_load_dwordx4 s[0:3], s[4:5], 0x24
+ v_lshlrev_b32_e32 v0, 2, v0
+ v_mov_b32_e32 v1, 0
+ s_waitcnt lgkmcnt(0)
+ global_load_dword v0, v0, s[2:3] glc
+ s_waitcnt vmcnt(0)
+ v_add_u32_e32 v0, 0x7b, v0
+ global_store_dword v1, v0, s[0:1]
+ s_endpgm
+.Lfunc_end6:
+ .size v_add_imm_i32, .Lfunc_end6-v_add_imm_i32
+ ; -- End function
+ .set v_add_imm_i32.num_vgpr, 2
+ .set v_add_imm_i32.num_agpr, 0
+ .set v_add_imm_i32.numbered_sgpr, 6
+ .set v_add_imm_i32.num_named_barrier, 0
+ .set v_add_imm_i32.private_seg_size, 0
+ .set v_add_imm_i32.uses_vcc, 0
+ .set v_add_imm_i32.uses_flat_scratch, 0
+ .set v_add_imm_i32.has_dyn_sized_stack, 0
+ .set v_add_imm_i32.has_recursion, 0
+ .set v_add_imm_i32.has_indirect_call, 0
+ .section .AMDGPU.csdata,"",@progbits
+; Kernel info:
+; codeLenInByte = 52
+; TotalNumSgprs: 10
+; NumVgprs: 2
+; NumAgprs: 0
+; TotalNumVgprs: 2
+; ScratchSize: 0
+; MemoryBound: 0
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 1
+; VGPRBlocks: 0
+; NumSGPRsForWavesPerEU: 10
+; NumVGPRsForWavesPerEU: 2
+; Occupancy: 10
+; WaveLimiterHint : 0
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 2
+ .section .AMDGPU.config,"",@progbits
+ .long 47176
+ .long 11468864
+ .long 47180
+ .long 5008
+ .long 47200
+ .long 0
+ .long 4
+ .long 0
+ .long 8
+ .long 0
+ .text
+ .globl add64 ; -- Begin function add64
+ .p2align 8
+ .type add64,@function
+add64: ; @add64
+; %bb.0: ; %entry
+ s_load_dwordx4 s[0:3], s[4:5], 0x24
+ s_load_dwordx2 s[6:7], s[4:5], 0x34
+ v_mov_b32_e32 v2, 0
+ s_waitcnt lgkmcnt(0)
+ s_add_u32 s2, s2, s6
+ s_addc_u32 s3, s3, s7
+ v_mov_b32_e32 v0, s2
+ v_mov_b32_e32 v1, s3
+ global_store_dwordx2 v2, v[0:1], s[0:1]
+ s_endpgm
+.Lfunc_end7:
+ .size add64, .Lfunc_end7-add64
+ ; -- End function
+ .set add64.num_vgpr, 3
+ .set add64.num_agpr, 0
+ .set add64.numbered_sgpr, 8
+ .set add64.num_named_barrier, 0
+ .set add64.private_seg_size, 0
+ .set add64.uses_vcc, 0
+ .set add64.uses_flat_scratch, 0
+ .set add64.has_dyn_sized_stack, 0
+ .set add64.has_recursion, 0
+ .set add64.has_indirect_call, 0
+ .section .AMDGPU.csdata,"",@progbits
+; Kernel info:
+; codeLenInByte = 52
+; TotalNumSgprs: 12
+; NumVgprs: 3
+; NumAgprs: 0
+; TotalNumVgprs: 3
+; ScratchSize: 0
+; MemoryBound: 1
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 1
+; VGPRBlocks: 0
+; NumSGPRsForWavesPerEU: 12
+; NumVGPRsForWavesPerEU: 3
+; Occupancy: 10
+; WaveLimiterHint : 1
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 2
+ .section .AMDGPU.config,"",@progbits
+ .long 47176
+ .long 11468864
+ .long 47180
+ .long 5008
+ .long 47200
+ .long 0
+ .long 4
+ .long 0
+ .long 8
+ .long 0
+ .text
+ .globl add64_sgpr_vgpr ; -- Begin function add64_sgpr_vgpr
+ .p2align 8
+ .type add64_sgpr_vgpr,@function
+add64_sgpr_vgpr: ; @add64_sgpr_vgpr
+; %bb.0: ; %entry
+ s_load_dwordx2 s[6:7], s[4:5], 0x34
+ s_load_dwordx4 s[0:3], s[4:5], 0x24
+ v_mov_b32_e32 v2, 0
+ s_waitcnt lgkmcnt(0)
+ s_load_dwordx2 s[4:5], s[6:7], 0x0
+ s_waitcnt lgkmcnt(0)
+ s_add_u32 s2, s2, s4
+ s_addc_u32 s3, s3, s5
+ v_mov_b32_e32 v0, s2
+ v_mov_b32_e32 v1, s3
+ global_store_dwordx2 v2, v[0:1], s[0:1]
+ s_endpgm
+.Lfunc_end8:
+ .size add64_sgpr_vgpr, .Lfunc_end8-add64_sgpr_vgpr
+ ; -- End function
+ .set add64_sgpr_vgpr.num_vgpr, 3
+ .set add64_sgpr_vgpr.num_agpr, 0
+ .set add64_sgpr_vgpr.numbered_sgpr, 8
+ .set add64_sgpr_vgpr.num_named_barrier, 0
+ .set add64_sgpr_vgpr.private_seg_size, 0
+ .set add64_sgpr_vgpr.uses_vcc, 0
+ .set add64_sgpr_vgpr.uses_flat_scratch, 0
+ .set add64_sgpr_vgpr.has_dyn_sized_stack, 0
+ .set add64_sgpr_vgpr.has_recursion, 0
+ .set add64_sgpr_vgpr.has_indirect_call, 0
+ .section .AMDGPU.csdata,"",@progbits
+; Kernel info:
+; codeLenInByte = 64
+; TotalNumSgprs: 12
+; NumVgprs: 3
+; NumAgprs: 0
+; TotalNumVgprs: 3
+; ScratchSize: 0
+; MemoryBound: 1
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 1
+; VGPRBlocks: 0
+; NumSGPRsForWavesPerEU: 12
+; NumVGPRsForWavesPerEU: 3
+; Occupancy: 10
+; WaveLimiterHint : 1
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 2
+ .section .AMDGPU.config,"",@progbits
+ .long 47176
+ .long 11468928
+ .long 47180
+ .long 5008
+ .long 47200
+ .long 0
+ .long 4
+ .long 0
+ .long 8
+ .long 0
+ .text
+ .globl add64_in_branch ; -- Begin function add64_in_branch
+ .p2align 8
+ .type add64_in_branch,@function
+add64_in_branch: ; @add64_in_branch
+; %bb.0: ; %entry
+ s_load_dwordx8 s[8:15], s[4:5], 0x24
+ s_mov_b64 s[2:3], 0
+ s_waitcnt lgkmcnt(0)
+ s_cmp_lg_u64 s[12:13], 0
+ s_cbranch_scc0 .LBB9_4
+; %bb.1: ; %else
+ s_add_u32 s0, s12, s14
+ s_addc_u32 s1, s13, s15
+ s_andn2_b64 vcc, exec, s[2:3]
+ s_cbranch_vccnz .LBB9_3
+.LBB9_2: ; %if
+ s_load_dwordx2 s[0:1], s[10:11], 0x0
+.LBB9_3: ; %endif
+ s_waitcnt lgkmcnt(0)
+ v_mov_b32_e32 v0, s0
+ v_mov_b32_e32 v2, 0
+ v_mov_b32_e32 v1, s1
+ global_store_dwordx2 v2, v[0:1], s[8:9]
+ s_endpgm
+.LBB9_4:
+ ; implicit-def: $sgpr0_sgpr1
+ s_branch .LBB9_2
+.Lfunc_end9:
+ .size add64_in_branch, .Lfunc_end9-add64_in_branch
+ ; -- End function
+ .set add64_in_branch.num_vgpr, 3
+ .set add64_in_branch.num_agpr, 0
+ .set add64_in_branch.numbered_sgpr, 16
+ .set add64_in_branch.num_named_barrier, 0
+ .set add64_in_branch.private_seg_size, 0
+ .set add64_in_branch.uses_vcc, 1
+ .set add64_in_branch.uses_flat_scratch, 0
+ .set add64_in_branch.has_dyn_sized_stack, 0
+ .set add64_in_branch.has_recursion, 0
+ .set add64_in_branch.has_indirect_call, 0
+ .section .AMDGPU.csdata,"",@progbits
+; Kernel info:
+; codeLenInByte = 80
+; TotalNumSgprs: 20
+; NumVgprs: 3
+; NumAgprs: 0
+; TotalNumVgprs: 3
+; ScratchSize: 0
+; MemoryBound: 0
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 2
+; VGPRBlocks: 0
+; NumSGPRsForWavesPerEU: 20
+; NumVGPRsForWavesPerEU: 3
+; Occupancy: 10
+; WaveLimiterHint : 0
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 2
+ .section .AMDGPU.config,"",@progbits
+ .long 45096
+ .long 0
+ .long 165608
+ .long 0
+ .long 45100
+ .long 0
+ .long 165580
+ .long 1
+ .long 165584
+ .long 1
+ .long 4
+ .long 0
+ .long 8
+ .long 0
+ .text
+ .globl add_select_vop3 ; -- Begin function add_select_vop3
+ .p2align 8
+ .type add_select_vop3,@function
+add_select_vop3: ; @add_select_vop3
+; %bb.0:
+ v_add_u32_e32 v0, s0, v0
+ ;;#ASMSTART
+ ; def vcc
+ ;;#ASMEND
+ ds_write_b32 v0, v0
+ ;;#ASMSTART
+ ; use vcc
+ ;;#ASMEND
+ s_endpgm
+.Lfunc_end10:
+ .size add_select_vop3, .Lfunc_end10-add_select_vop3
+ ; -- End function
+ .set add_select_vop3.num_vgpr, 1
+ .set add_select_vop3.num_agpr, 0
+ .set add_select_vop3.numbered_sgpr, 1
+ .set add_select_vop3.num_named_barrier, 0
+ .set add_select_vop3.private_seg_size, 0
+ .set add_select_vop3.uses_vcc, 1
+ .set add_select_vop3.uses_flat_scratch, 0
+ .set add_select_vop3.has_dyn_sized_stack, 0
+ .set add_select_vop3.has_recursion, 0
+ .set add_select_vop3.has_indirect_call, 0
+ .section .AMDGPU.csdata,"",@progbits
+; Kernel info:
+; codeLenInByte = 16
+; TotalNumSgprs: 5
+; NumVgprs: 1
+; NumAgprs: 0
+; TotalNumVgprs: 1
+; ScratchSize: 0
+; MemoryBound: 0
+; FloatMode: 240
+; IeeeMode: 0
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 0
+; VGPRBlocks: 0
+; NumSGPRsForWavesPerEU: 5
+; NumVGPRsForWavesPerEU: 1
+; Occupancy: 10
+; WaveLimiterHint : 0
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 0
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 0
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+ .section .AMDGPU.gpr_maximums,"",@progbits
+ .set amdgpu.max_num_vgpr, 0
+ .set amdgpu.max_num_agpr, 0
+ .set amdgpu.max_num_sgpr, 0
+ .section .AMDGPU.csdata,"",@progbits
+ .section ".note.GNU-stack","",@progbits
+ .amd_amdgpu_isa "amdgcn-unknown-linux-gnu-gfx908"