author     Austin Kerbow <Austin.Kerbow@amd.com>    2025-02-25 21:46:05 -0800
committer  Austin Kerbow <Austin.Kerbow@amd.com>    2025-09-15 22:51:53 -0700
commit     d57286a6889d75d7dbc2cfebb55b45ee4268f018 (patch)
tree       8902c7ed3c8c90f09ea0fd18deb2754b4be400d4
parent     450737fba25203c8dfeca036925e9af9d6c22f0a (diff)
[AMDGPU] Dynamically set load latency in the scheduler (branch: users/kerbowa/amdgpu-load-lat-scale)
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   34
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSchedStrategy.h      7
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        59
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h          39
-rw-r--r--  llvm/test/CodeGen/AMDGPU/res                 793
5 files changed, 931 insertions(+), 1 deletion(-)
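Before the per-file hunks, a minimal standalone sketch of the scaling rule this patch introduces may help. It mirrors the arithmetic in the new SIInstrInfo latency hooks (`Latency * Factor / 100`, with integer truncation); the helper function and the sample latency values are illustrative only, not part of the patch.

// Standalone sketch (not LLVM code) of the load-latency scaling rule.
#include <cassert>

// Mirrors the arithmetic in the new SIInstrInfo::getInstrLatency override:
// a load's modeled latency is multiplied by a percentage factor, then
// truncated by integer division.
unsigned scaleLoadLatency(unsigned Latency, unsigned Factor) {
  return (Latency * Factor) / 100;
}

int main() {
  assert(scaleLoadLatency(20, 100) == 20); // default factor: unchanged
  assert(scaleLoadLatency(20, 300) == 60); // ILP-stage default of 300: 3x
  assert(scaleLoadLatency(1, 150) == 1);   // truncation: 1 * 150 / 100 == 1
  return 0;
}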
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 254b75b..ae553da 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1150,6 +1150,40 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() {
   GCNSchedStage::finalizeGCNSchedStage();
 }

+bool ILPInitialScheduleStage::initGCNSchedStage() {
+  if (!GCNSchedStage::initGCNSchedStage())
+    return false;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  OriginalLoadLatencyScaleFactor = TII->getLoadLatencyScaleFactor();
+  OriginalDSReadLatencyScaleFactor = TII->getDSReadLatencyScaleFactor();
+  OriginalVMEMLoadLatencyScaleFactor = TII->getVMEMLoadLatencyScaleFactor();
+  const unsigned ILPLoadLatencyScaleFactorDefault = 300;
+  if (ILPLoadLatencyScaleFactorDefault > TII->getLoadLatencyScaleFactor())
+    TII->setLoadLatencyScaleFactor(ILPLoadLatencyScaleFactorDefault);
+  if (ILPLoadLatencyScaleFactorDefault > TII->getDSReadLatencyScaleFactor())
+    TII->setDSReadLatencyScaleFactor(ILPLoadLatencyScaleFactorDefault);
+  if (ILPLoadLatencyScaleFactorDefault > TII->getVMEMLoadLatencyScaleFactor())
+    TII->setVMEMLoadLatencyScaleFactor(ILPLoadLatencyScaleFactorDefault);
+
+  LLVM_DEBUG(dbgs() << "ILP Initial Schedule: Set load latency scale factor to "
+                    << TII->getLoadLatencyScaleFactor() << '\n');
+  return true;
+}
+
+void ILPInitialScheduleStage::finalizeGCNSchedStage() {
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  TII->setLoadLatencyScaleFactor(OriginalLoadLatencyScaleFactor);
+  TII->setDSReadLatencyScaleFactor(OriginalDSReadLatencyScaleFactor);
+  TII->setVMEMLoadLatencyScaleFactor(OriginalVMEMLoadLatencyScaleFactor);
+
+  LLVM_DEBUG(
+      dbgs() << "ILP Initial Schedule: Restored load latency scale factor to "
+             << OriginalLoadLatencyScaleFactor << "\n");
+
+  GCNSchedStage::finalizeGCNSchedStage();
+}
+
 bool GCNSchedStage::initGCNRegion() {
   // Check whether this new region is also a new block.
   if (DAG.RegionBegin->getParent() != CurrentMBB)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 790370f..5be6d4b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -515,8 +515,15 @@ public:
 };

 class ILPInitialScheduleStage : public GCNSchedStage {
+private:
+  unsigned OriginalLoadLatencyScaleFactor = 0;
+  unsigned OriginalDSReadLatencyScaleFactor = 0;
+  unsigned OriginalVMEMLoadLatencyScaleFactor = 0;
+
 public:
   bool shouldRevertScheduling(unsigned WavesAfter) override;
+  bool initGCNSchedStage() override;
+  void finalizeGCNSchedStage() override;

   ILPInitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
       : GCNSchedStage(StageID, DAG) {}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 5106478..a35aabd 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -62,9 +62,29 @@ static cl::opt<bool> Fix16BitCopies(
     cl::init(true),
     cl::ReallyHidden);

+static cl::opt<unsigned> AMDGPULoadLatencyScaleFactor(
+    "amdgpu-load-latency-scale-factor",
+    cl::desc("Scale factor for load instruction latency. Final latency is "
+             "scaled by `Latency * Factor / 100`."),
+    cl::init(100), cl::ReallyHidden);
+
+static cl::opt<unsigned> AMDGPUDSReadLatencyScaleFactor(
+    "amdgpu-ds-read-latency-scale-factor",
+    cl::desc("Scale factor for LDS (DS) read instruction latency. Final "
+             "latency is scaled by `Latency * Factor / 100`."),
+    cl::init(100), cl::ReallyHidden);
+
+static cl::opt<unsigned> AMDGPUVMEMLoadLatencyScaleFactor(
+    "amdgpu-vmem-load-latency-scale-factor",
+    cl::desc("Scale factor for VMEM/BUFFER/FLAT load instruction latency. "
+             "Final latency is scaled by `Latency * Factor / 100`."),
+    cl::init(100), cl::ReallyHidden);
+
 SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
     : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
-      RI(ST), ST(ST) {
+      RI(ST), ST(ST), LoadLatencyScaleFactor(AMDGPULoadLatencyScaleFactor),
+      DSReadLatencyScaleFactor(AMDGPUDSReadLatencyScaleFactor),
+      VMEMLoadLatencyScaleFactor(AMDGPUVMEMLoadLatencyScaleFactor) {
   SchedModel.init(&ST);
 }

@@ -10240,6 +10260,43 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
   return SchedModel.computeInstrLatency(&MI);
 }

+std::optional<unsigned>
+SIInstrInfo::getInstrLatency(const TargetSchedModel &TargetSchedModel,
+                             const MachineInstr &MI) const {
+  auto LatencyOpt = TargetInstrInfo::getInstrLatency(TargetSchedModel, MI);
+  if (!LatencyOpt)
+    return std::nullopt;
+  unsigned Latency = *LatencyOpt;
+  if (MI.mayLoad()) {
+    unsigned Scale = LoadLatencyScaleFactor;
+    if (isDS(MI))
+      Scale = DSReadLatencyScaleFactor;
+    else if (isVMEM(MI) || isFLAT(MI))
+      Scale = VMEMLoadLatencyScaleFactor;
+    Latency = (Latency * Scale) / 100;
+  }
+  return Latency;
+}
+
+std::optional<unsigned> SIInstrInfo::getOperandLatency(
+    const TargetSchedModel &SchedModel, const MachineInstr *DefMI,
+    unsigned DefOperIdx, const MachineInstr *UseMI, unsigned UseOperIdx) const {
+  auto LatOpt = TargetInstrInfo::getOperandLatency(
+      SchedModel, DefMI, DefOperIdx, UseMI, UseOperIdx);
+  if (!LatOpt)
+    return std::nullopt;
+  unsigned Latency = *LatOpt;
+  if (DefMI && DefMI->mayLoad()) {
+    unsigned Scale = LoadLatencyScaleFactor;
+    if (isDS(*DefMI))
+      Scale = DSReadLatencyScaleFactor;
+    else if (isVMEM(*DefMI) || isFLAT(*DefMI))
+      Scale = VMEMLoadLatencyScaleFactor;
+    Latency = (Latency * Scale) / 100;
+  }
+  return Latency;
+}
+
 InstructionUniformity
 SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
   const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index dffb3d7..e01e303 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -93,6 +93,13 @@ private:
   const GCNSubtarget &ST;
   TargetSchedModel SchedModel;
   mutable std::unique_ptr<AMDGPUMIRFormatter> Formatter;
+  // Final load latency in the machine model is scaled by
+  // `Latency * Factor / 100`.
+  mutable unsigned LoadLatencyScaleFactor = 100;
+  // Separate scale factor for LDS (DS) read operations.
+  mutable unsigned DSReadLatencyScaleFactor = 100;
+  // Separate scale factor for VMEM/BUFFER/FLAT loads.
+  mutable unsigned VMEMLoadLatencyScaleFactor = 100;

   // The inverse predicate should have the negative value.
   enum BranchPredicate {
@@ -111,6 +118,38 @@ private:
   static BranchPredicate getBranchPredicate(unsigned Opcode);

 public:
+  void setLoadLatencyScaleFactor(unsigned Factor) const {
+    LoadLatencyScaleFactor = Factor;
+  }
+
+  unsigned getLoadLatencyScaleFactor() const { return LoadLatencyScaleFactor; }
+
+  // Control DS read (LDS) latency scaling independently when desired.
+  void setDSReadLatencyScaleFactor(unsigned Factor) const {
+    DSReadLatencyScaleFactor = Factor;
+  }
+  unsigned getDSReadLatencyScaleFactor() const {
+    return DSReadLatencyScaleFactor;
+  }
+
+  // Control VMEM/BUFFER/FLAT load latency scaling independently.
+  void setVMEMLoadLatencyScaleFactor(unsigned Factor) const {
+    VMEMLoadLatencyScaleFactor = Factor;
+  }
+  unsigned getVMEMLoadLatencyScaleFactor() const {
+    return VMEMLoadLatencyScaleFactor;
+  }
+
+  // TargetSchedModel latency hooks.
+  std::optional<unsigned>
+  getInstrLatency(const TargetSchedModel &TargetSchedModel,
+                  const MachineInstr &MI) const override;
+  std::optional<unsigned> getOperandLatency(const TargetSchedModel &SchedModel,
+                                            const MachineInstr *DefMI,
+                                            unsigned DefIdx,
+                                            const MachineInstr *UseMI,
+                                            unsigned UseIdx) const override;
+
   unsigned buildExtractSubReg(MachineBasicBlock::iterator MI,
                               MachineRegisterInfo &MRI,
                               const MachineOperand &SuperReg,
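Before the new test file below, a hedged aside: the save/raise/restore sequence that ILPInitialScheduleStage performs around the ILP initial schedule can be summarized with the sketch that follows. FakeTII is a stand-in for SIInstrInfo and the factor values are the defaults from the patch; nothing here is verbatim from LLVM.

// Illustrative model of ILPInitialScheduleStage's factor handling.
#include <algorithm>
#include <cassert>

struct FakeTII {             // stand-in for SIInstrInfo (assumption)
  unsigned LoadFactor = 100; // mirrors the cl::opt default of 100
};

int main() {
  FakeTII TII;
  // initGCNSchedStage(): save the current factor, then raise it to at
  // least 300; a user-specified factor above 300 is left untouched.
  const unsigned ILPDefault = 300;
  const unsigned Saved = TII.LoadFactor;
  TII.LoadFactor = std::max(TII.LoadFactor, ILPDefault);
  assert(TII.LoadFactor == 300);
  // ... the ILP initial schedule runs with inflated load latencies ...
  // finalizeGCNSchedStage(): restore the saved factor so later scheduling
  // stages see the original value again.
  TII.LoadFactor = Saved;
  assert(TII.LoadFactor == 100);
  return 0;
}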
diff --git a/llvm/test/CodeGen/AMDGPU/res b/llvm/test/CodeGen/AMDGPU/res
new file mode 100644
index 0000000..a190b03
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/res
@@ -0,0 +1,793 @@
+ .section .AMDGPU.config,"",@progbits
+ .long 47176
+ .long 11468864
+ .long 47180
+ .long 5008
+ .long 47200
+ .long 0
+ .long 4
+ .long 0
+ .long 8
+ .long 0
+ .text
+ .globl s_add_i32 ; -- Begin function s_add_i32
+ .p2align 8
+ .type s_add_i32,@function
+s_add_i32: ; @s_add_i32
+; %bb.0:
+ s_load_dwordx4 s[0:3], s[4:5], 0x24
+ v_mov_b32_e32 v0, 0
+ s_waitcnt lgkmcnt(0)
+ s_load_dwordx2 s[4:5], s[2:3], 0x0
+ s_waitcnt lgkmcnt(0)
+ s_add_i32 s2, s4, s5
+ v_mov_b32_e32 v1, s2
+ global_store_dword v0, v1, s[0:1]
+ s_endpgm
+.Lfunc_end0:
+ .size s_add_i32, .Lfunc_end0-s_add_i32
+ ; -- End function
+ .set s_add_i32.num_vgpr, 2
+ .set s_add_i32.num_agpr, 0
+ .set s_add_i32.numbered_sgpr, 6
+ .set s_add_i32.num_named_barrier, 0
+ .set s_add_i32.private_seg_size, 0
+ .set s_add_i32.uses_vcc, 0
+ .set s_add_i32.uses_flat_scratch, 0
+ .set s_add_i32.has_dyn_sized_stack, 0
+ .set s_add_i32.has_recursion, 0
+ .set s_add_i32.has_indirect_call, 0
+ .section .AMDGPU.csdata,"",@progbits
+; Kernel info:
+; codeLenInByte = 48
+; TotalNumSgprs: 10
+; NumVgprs: 2
+; NumAgprs: 0
+; TotalNumVgprs: 2
+; ScratchSize: 0
+; MemoryBound: 0
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 1
+; VGPRBlocks: 0
+; NumSGPRsForWavesPerEU: 10
+; NumVGPRsForWavesPerEU: 2
+; Occupancy: 10
+; WaveLimiterHint : 0
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 2
+ .section .AMDGPU.config,"",@progbits
+ .long 47176
+ .long 11468864
+ .long 47180
+ .long 5008
+ .long 47200
+ .long 0
+ .long 4
+ .long 0
+ .long 8
+ .long 0
+ .text
+ .globl s_add_v2i32 ; -- Begin function s_add_v2i32
+ .p2align 8
+ .type s_add_v2i32,@function
+s_add_v2i32: ; @s_add_v2i32
+; %bb.0:
+ s_load_dwordx4 s[0:3], s[4:5], 0x24
+ v_mov_b32_e32 v2, 0
+ s_waitcnt lgkmcnt(0)
+ s_load_dwordx4 s[4:7], s[2:3], 0x0
+ s_waitcnt lgkmcnt(0)
+ s_add_i32 s2, s5, s7
+ s_add_i32 s3, s4, s6
+ v_mov_b32_e32 v0, s3
+ v_mov_b32_e32 v1, s2
+ global_store_dwordx2 v2, v[0:1], s[0:1]
+ s_endpgm
+.Lfunc_end1:
+ .size s_add_v2i32, .Lfunc_end1-s_add_v2i32
+ ; -- End function
+ .set s_add_v2i32.num_vgpr, 3
+ .set s_add_v2i32.num_agpr, 0
+ .set s_add_v2i32.numbered_sgpr, 8
+ .set s_add_v2i32.num_named_barrier, 0
+ .set s_add_v2i32.private_seg_size, 0
+ .set s_add_v2i32.uses_vcc, 0
+ .set s_add_v2i32.uses_flat_scratch, 0
+ .set s_add_v2i32.has_dyn_sized_stack, 0
+ .set s_add_v2i32.has_recursion, 0
+ .set s_add_v2i32.has_indirect_call, 0
+ .section .AMDGPU.csdata,"",@progbits
+; Kernel info:
+; codeLenInByte = 56
+; TotalNumSgprs: 12
+; NumVgprs: 3
+; NumAgprs: 0
+; TotalNumVgprs: 3
+; ScratchSize: 0
+; MemoryBound: 1
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 1
+; VGPRBlocks: 0
+; NumSGPRsForWavesPerEU: 12
+; NumVGPRsForWavesPerEU: 3
+; Occupancy: 10
+; WaveLimiterHint : 1
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 2
+ .section .AMDGPU.config,"",@progbits
+ .long 47176
+ .long 11468865
+ .long 47180
+ .long 5008
+ .long 47200
+ .long 0
+ .long 4
+ .long 0
+ .long 8
+ .long 0
+ .text
+ .globl s_add_v4i32 ; -- Begin function s_add_v4i32
+ .p2align 8
+ .type s_add_v4i32,@function
+s_add_v4i32: ; @s_add_v4i32
+; %bb.0:
+ s_load_dwordx4 s[8:11], s[4:5], 0x24
+ v_mov_b32_e32 v4, 0
+ s_waitcnt lgkmcnt(0)
+ s_load_dwordx8 s[0:7], s[10:11], 0x0
+ s_waitcnt lgkmcnt(0)
+ s_add_i32 s3, s3, s7
+ s_add_i32 s2, s2, s6
+ s_add_i32 s1, s1, s5
+ s_add_i32 s0, s0, s4
+ v_mov_b32_e32 v0, s0
+ v_mov_b32_e32 v1, s1
+ v_mov_b32_e32 v2, s2
+ v_mov_b32_e32 v3, s3
+ global_store_dwordx4 v4, v[0:3], s[8:9]
+ s_endpgm
+.Lfunc_end2:
+ .size s_add_v4i32, .Lfunc_end2-s_add_v4i32
+ ; -- End function
+ .set s_add_v4i32.num_vgpr, 5
+ .set s_add_v4i32.num_agpr, 0
+ .set s_add_v4i32.numbered_sgpr, 12
+ .set s_add_v4i32.num_named_barrier, 0
+ .set s_add_v4i32.private_seg_size, 0
+ .set s_add_v4i32.uses_vcc, 0
+ .set s_add_v4i32.uses_flat_scratch, 0
+ .set s_add_v4i32.has_dyn_sized_stack, 0
+ .set s_add_v4i32.has_recursion, 0
+ .set s_add_v4i32.has_indirect_call, 0
+ .section .AMDGPU.csdata,"",@progbits
+; Kernel info:
+; codeLenInByte = 72
+; TotalNumSgprs: 16
+; NumVgprs: 5
+; NumAgprs: 0
+; TotalNumVgprs: 5
+; ScratchSize: 0
+; MemoryBound: 1
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 1
+; VGPRBlocks: 1
+; NumSGPRsForWavesPerEU: 16
+; NumVGPRsForWavesPerEU: 5
+; Occupancy: 10
+; WaveLimiterHint : 1
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 2
+ .section .AMDGPU.config,"",@progbits
+ .long 47176
+ .long 11468993
+ .long 47180
+ .long 5008
+ .long 47200
+ .long 0
+ .long 4
+ .long 0
+ .long 8
+ .long 0
+ .text
+ .globl s_add_v8i32 ; -- Begin function s_add_v8i32
+ .p2align 8
+ .type s_add_v8i32,@function
+s_add_v8i32: ; @s_add_v8i32
+; %bb.0: ; %entry
+ s_load_dwordx16 s[8:23], s[4:5], 0x44
+ s_load_dwordx2 s[0:1], s[4:5], 0x24
+ v_mov_b32_e32 v4, 0
+ s_waitcnt lgkmcnt(0)
+ s_add_i32 s4, s9, s17
+ s_add_i32 s5, s8, s16
+ s_add_i32 s6, s15, s23
+ s_add_i32 s7, s14, s22
+ s_add_i32 s8, s13, s21
+ s_add_i32 s9, s12, s20
+ s_add_i32 s2, s11, s19
+ s_add_i32 s3, s10, s18
+ v_mov_b32_e32 v0, s9
+ v_mov_b32_e32 v1, s8
+ v_mov_b32_e32 v2, s7
+ v_mov_b32_e32 v3, s6
+ global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+ s_nop 0
+ v_mov_b32_e32 v0, s5
+ v_mov_b32_e32 v1, s4
+ v_mov_b32_e32 v2, s3
+ v_mov_b32_e32 v3, s2
+ global_store_dwordx4 v4, v[0:3], s[0:1]
+ s_endpgm
+.Lfunc_end3:
+ .size s_add_v8i32, .Lfunc_end3-s_add_v8i32
+ ; -- End function
+ .set s_add_v8i32.num_vgpr, 5
+ .set s_add_v8i32.num_agpr, 0
+ .set s_add_v8i32.numbered_sgpr, 24
+ .set s_add_v8i32.num_named_barrier, 0
+ .set s_add_v8i32.private_seg_size, 0
+ .set s_add_v8i32.uses_vcc, 0
+ .set s_add_v8i32.uses_flat_scratch, 0
+ .set s_add_v8i32.has_dyn_sized_stack, 0
+ .set s_add_v8i32.has_recursion, 0
+ .set s_add_v8i32.has_indirect_call, 0
+ .section .AMDGPU.csdata,"",@progbits
+; Kernel info:
+; codeLenInByte = 112
+; TotalNumSgprs: 28
+; NumVgprs: 5
+; NumAgprs: 0
+; TotalNumVgprs: 5
+; ScratchSize: 0
+; MemoryBound: 1
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 3
+; VGPRBlocks: 1
+; NumSGPRsForWavesPerEU: 28
+; NumVGPRsForWavesPerEU: 5
+; Occupancy: 10
+; WaveLimiterHint : 1
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 2
+ .section .AMDGPU.config,"",@progbits
+ .long 47176
+ .long 11469185
+ .long 47180
+ .long 5008
+ .long 47200
+ .long 0
+ .long 4
+ .long 0
+ .long 8
+ .long 0
+ .text
+ .globl s_add_v16i32 ; -- Begin function s_add_v16i32
+ .p2align 8
+ .type s_add_v16i32,@function
+s_add_v16i32: ; @s_add_v16i32
+; %bb.0: ; %entry
+ s_load_dwordx16 s[8:23], s[4:5], 0x64
+ s_load_dwordx16 s[36:51], s[4:5], 0xa4
+ s_load_dwordx2 s[0:1], s[4:5], 0x24
+ v_mov_b32_e32 v4, 0
+ s_waitcnt lgkmcnt(0)
+ s_add_i32 s4, s9, s37
+ s_add_i32 s5, s8, s36
+ s_add_i32 s6, s15, s43
+ s_add_i32 s7, s14, s42
+ s_add_i32 s8, s13, s41
+ s_add_i32 s9, s12, s40
+ s_add_i32 s12, s17, s45
+ s_add_i32 s13, s16, s44
+ s_add_i32 s14, s23, s51
+ s_add_i32 s15, s22, s50
+ s_add_i32 s16, s21, s49
+ s_add_i32 s17, s20, s48
+ s_add_i32 s2, s11, s39
+ s_add_i32 s3, s10, s38
+ s_add_i32 s10, s19, s47
+ s_add_i32 s11, s18, s46
+ v_mov_b32_e32 v0, s17
+ v_mov_b32_e32 v1, s16
+ v_mov_b32_e32 v2, s15
+ v_mov_b32_e32 v3, s14
+ global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+ s_nop 0
+ v_mov_b32_e32 v0, s13
+ v_mov_b32_e32 v1, s12
+ v_mov_b32_e32 v2, s11
+ v_mov_b32_e32 v3, s10
+ global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+ s_nop 0
+ v_mov_b32_e32 v0, s9
+ v_mov_b32_e32 v1, s8
+ v_mov_b32_e32 v2, s7
+ v_mov_b32_e32 v3, s6
+ global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+ s_nop 0
+ v_mov_b32_e32 v0, s5
+ v_mov_b32_e32 v1, s4
+ v_mov_b32_e32 v2, s3
+ v_mov_b32_e32 v3, s2
+ global_store_dwordx4 v4, v[0:3], s[0:1]
+ s_endpgm
+.Lfunc_end4:
+ .size s_add_v16i32, .Lfunc_end4-s_add_v16i32
+ ; -- End function
+ .set s_add_v16i32.num_vgpr, 5
+ .set s_add_v16i32.num_agpr, 0
+ .set s_add_v16i32.numbered_sgpr, 52
+ .set s_add_v16i32.num_named_barrier, 0
+ .set s_add_v16i32.private_seg_size, 0
+ .set s_add_v16i32.uses_vcc, 0
+ .set s_add_v16i32.uses_flat_scratch, 0
+ .set s_add_v16i32.has_dyn_sized_stack, 0
+ .set s_add_v16i32.has_recursion, 0
+ .set s_add_v16i32.has_indirect_call, 0
+ .section .AMDGPU.csdata,"",@progbits
+; Kernel info:
+; codeLenInByte = 208
+; TotalNumSgprs: 56
+; NumVgprs: 5
+; NumAgprs: 0
+; TotalNumVgprs: 5
+; ScratchSize: 0
+; MemoryBound: 1
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 6
+; VGPRBlocks: 1
+; NumSGPRsForWavesPerEU: 56
+; NumVGPRsForWavesPerEU: 5
+; Occupancy: 10
+; WaveLimiterHint : 1
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 2
+ .section .AMDGPU.config,"",@progbits
+ .long 47176
+ .long 11468864
+ .long 47180
+ .long 5008
+ .long 47200
+ .long 0
+ .long 4
+ .long 0
+ .long 8
+ .long 0
+ .text
+ .globl v_add_i32 ; -- Begin function v_add_i32
+ .p2align 8
+ .type v_add_i32,@function
+v_add_i32: ; @v_add_i32
+; %bb.0:
+ s_load_dwordx4 s[0:3], s[4:5], 0x24
+ v_lshlrev_b32_e32 v0, 2, v0
+ s_waitcnt lgkmcnt(0)
+ global_load_dword v1, v0, s[2:3] glc
+ s_waitcnt vmcnt(0)
+ global_load_dword v2, v0, s[2:3] offset:4 glc
+ s_waitcnt vmcnt(0)
+ v_mov_b32_e32 v0, 0
+ v_add_u32_e32 v1, v1, v2
+ global_store_dword v0, v1, s[0:1]
+ s_endpgm
+.Lfunc_end5:
+ .size v_add_i32, .Lfunc_end5-v_add_i32
+ ; -- End function
+ .set v_add_i32.num_vgpr, 3
+ .set v_add_i32.num_agpr, 0
+ .set v_add_i32.numbered_sgpr, 6
+ .set v_add_i32.num_named_barrier, 0
+ .set v_add_i32.private_seg_size, 0
+ .set v_add_i32.uses_vcc, 0
+ .set v_add_i32.uses_flat_scratch, 0
+ .set v_add_i32.has_dyn_sized_stack, 0
+ .set v_add_i32.has_recursion, 0
+ .set v_add_i32.has_indirect_call, 0
+ .section .AMDGPU.csdata,"",@progbits
+; Kernel info:
+; codeLenInByte = 60
+; TotalNumSgprs: 10
+; NumVgprs: 3
+; NumAgprs: 0
+; TotalNumVgprs: 3
+; ScratchSize: 0
+; MemoryBound: 0
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 1
+; VGPRBlocks: 0
+; NumSGPRsForWavesPerEU: 10
+; NumVGPRsForWavesPerEU: 3
+; Occupancy: 10
+; WaveLimiterHint : 0
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 2
+ .section .AMDGPU.config,"",@progbits
+ .long 47176
+ .long 11468864
+ .long 47180
+ .long 5008
+ .long 47200
+ .long 0
+ .long 4
+ .long 0
+ .long 8
+ .long 0
+ .text
+ .globl v_add_imm_i32 ; -- Begin function v_add_imm_i32
+ .p2align 8
+ .type v_add_imm_i32,@function
+v_add_imm_i32: ; @v_add_imm_i32
+; %bb.0:
+ s_load_dwordx4 s[0:3], s[4:5], 0x24
+ v_lshlrev_b32_e32 v0, 2, v0
+ v_mov_b32_e32 v1, 0
+ s_waitcnt lgkmcnt(0)
+ global_load_dword v0, v0, s[2:3] glc
+ s_waitcnt vmcnt(0)
+ v_add_u32_e32 v0, 0x7b, v0
+ global_store_dword v1, v0, s[0:1]
+ s_endpgm
+.Lfunc_end6:
+ .size v_add_imm_i32, .Lfunc_end6-v_add_imm_i32
+ ; -- End function
+ .set v_add_imm_i32.num_vgpr, 2
+ .set v_add_imm_i32.num_agpr, 0
+ .set v_add_imm_i32.numbered_sgpr, 6
+ .set v_add_imm_i32.num_named_barrier, 0
+ .set v_add_imm_i32.private_seg_size, 0
+ .set v_add_imm_i32.uses_vcc, 0
+ .set v_add_imm_i32.uses_flat_scratch, 0
+ .set v_add_imm_i32.has_dyn_sized_stack, 0
+ .set v_add_imm_i32.has_recursion, 0
+ .set v_add_imm_i32.has_indirect_call, 0
+ .section .AMDGPU.csdata,"",@progbits
+; Kernel info:
+; codeLenInByte = 52
+; TotalNumSgprs: 10
+; NumVgprs: 2
+; NumAgprs: 0
+; TotalNumVgprs: 2
+; ScratchSize: 0
+; MemoryBound: 0
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 1
+; VGPRBlocks: 0
+; NumSGPRsForWavesPerEU: 10
+; NumVGPRsForWavesPerEU: 2
+; Occupancy: 10
+; WaveLimiterHint : 0
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 2
+ .section .AMDGPU.config,"",@progbits
+ .long 47176
+ .long 11468864
+ .long 47180
+ .long 5008
+ .long 47200
+ .long 0
+ .long 4
+ .long 0
+ .long 8
+ .long 0
+ .text
+ .globl add64 ; -- Begin function add64
+ .p2align 8
+ .type add64,@function
+add64: ; @add64
+; %bb.0: ; %entry
+ s_load_dwordx4 s[0:3], s[4:5], 0x24
+ s_load_dwordx2 s[6:7], s[4:5], 0x34
+ v_mov_b32_e32 v2, 0
+ s_waitcnt lgkmcnt(0)
+ s_add_u32 s2, s2, s6
+ s_addc_u32 s3, s3, s7
+ v_mov_b32_e32 v0, s2
+ v_mov_b32_e32 v1, s3
+ global_store_dwordx2 v2, v[0:1], s[0:1]
+ s_endpgm
+.Lfunc_end7:
+ .size add64, .Lfunc_end7-add64
+ ; -- End function
+ .set add64.num_vgpr, 3
+ .set add64.num_agpr, 0
+ .set add64.numbered_sgpr, 8
+ .set add64.num_named_barrier, 0
+ .set add64.private_seg_size, 0
+ .set add64.uses_vcc, 0
+ .set add64.uses_flat_scratch, 0
+ .set add64.has_dyn_sized_stack, 0
+ .set add64.has_recursion, 0
+ .set add64.has_indirect_call, 0
+ .section .AMDGPU.csdata,"",@progbits
+; Kernel info:
+; codeLenInByte = 52
+; TotalNumSgprs: 12
+; NumVgprs: 3
+; NumAgprs: 0
+; TotalNumVgprs: 3
+; ScratchSize: 0
+; MemoryBound: 1
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 1
+; VGPRBlocks: 0
+; NumSGPRsForWavesPerEU: 12
+; NumVGPRsForWavesPerEU: 3
+; Occupancy: 10
+; WaveLimiterHint : 1
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 2
+ .section .AMDGPU.config,"",@progbits
+ .long 47176
+ .long 11468864
+ .long 47180
+ .long 5008
+ .long 47200
+ .long 0
+ .long 4
+ .long 0
+ .long 8
+ .long 0
+ .text
+ .globl add64_sgpr_vgpr ; -- Begin function add64_sgpr_vgpr
+ .p2align 8
+ .type add64_sgpr_vgpr,@function
+add64_sgpr_vgpr: ; @add64_sgpr_vgpr
+; %bb.0: ; %entry
+ s_load_dwordx2 s[6:7], s[4:5], 0x34
+ s_load_dwordx4 s[0:3], s[4:5], 0x24
+ v_mov_b32_e32 v2, 0
+ s_waitcnt lgkmcnt(0)
+ s_load_dwordx2 s[4:5], s[6:7], 0x0
+ s_waitcnt lgkmcnt(0)
+ s_add_u32 s2, s2, s4
+ s_addc_u32 s3, s3, s5
+ v_mov_b32_e32 v0, s2
+ v_mov_b32_e32 v1, s3
+ global_store_dwordx2 v2, v[0:1], s[0:1]
+ s_endpgm
+.Lfunc_end8:
+ .size add64_sgpr_vgpr, .Lfunc_end8-add64_sgpr_vgpr
+ ; -- End function
+ .set add64_sgpr_vgpr.num_vgpr, 3
+ .set add64_sgpr_vgpr.num_agpr, 0
+ .set add64_sgpr_vgpr.numbered_sgpr, 8
+ .set add64_sgpr_vgpr.num_named_barrier, 0
+ .set add64_sgpr_vgpr.private_seg_size, 0
+ .set add64_sgpr_vgpr.uses_vcc, 0
+ .set add64_sgpr_vgpr.uses_flat_scratch, 0
+ .set add64_sgpr_vgpr.has_dyn_sized_stack, 0
+ .set add64_sgpr_vgpr.has_recursion, 0
+ .set add64_sgpr_vgpr.has_indirect_call, 0
+ .section .AMDGPU.csdata,"",@progbits
+; Kernel info:
+; codeLenInByte = 64
+; TotalNumSgprs: 12
+; NumVgprs: 3
+; NumAgprs: 0
+; TotalNumVgprs: 3
+; ScratchSize: 0
+; MemoryBound: 1
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 1
+; VGPRBlocks: 0
+; NumSGPRsForWavesPerEU: 12
+; NumVGPRsForWavesPerEU: 3
+; Occupancy: 10
+; WaveLimiterHint : 1
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 2
+ .section .AMDGPU.config,"",@progbits
+ .long 47176
+ .long 11468928
+ .long 47180
+ .long 5008
+ .long 47200
+ .long 0
+ .long 4
+ .long 0
+ .long 8
+ .long 0
+ .text
+ .globl add64_in_branch ; -- Begin function add64_in_branch
+ .p2align 8
+ .type add64_in_branch,@function
+add64_in_branch: ; @add64_in_branch
+; %bb.0: ; %entry
+ s_load_dwordx8 s[8:15], s[4:5], 0x24
+ s_mov_b64 s[2:3], 0
+ s_waitcnt lgkmcnt(0)
+ s_cmp_lg_u64 s[12:13], 0
+ s_cbranch_scc0 .LBB9_4
+; %bb.1: ; %else
+ s_add_u32 s0, s12, s14
+ s_addc_u32 s1, s13, s15
+ s_andn2_b64 vcc, exec, s[2:3]
+ s_cbranch_vccnz .LBB9_3
+.LBB9_2: ; %if
+ s_load_dwordx2 s[0:1], s[10:11], 0x0
+.LBB9_3: ; %endif
+ s_waitcnt lgkmcnt(0)
+ v_mov_b32_e32 v0, s0
+ v_mov_b32_e32 v2, 0
+ v_mov_b32_e32 v1, s1
+ global_store_dwordx2 v2, v[0:1], s[8:9]
+ s_endpgm
+.LBB9_4:
+ ; implicit-def: $sgpr0_sgpr1
+ s_branch .LBB9_2
+.Lfunc_end9:
+ .size add64_in_branch, .Lfunc_end9-add64_in_branch
+ ; -- End function
+ .set add64_in_branch.num_vgpr, 3
+ .set add64_in_branch.num_agpr, 0
+ .set add64_in_branch.numbered_sgpr, 16
+ .set add64_in_branch.num_named_barrier, 0
+ .set add64_in_branch.private_seg_size, 0
+ .set add64_in_branch.uses_vcc, 1
+ .set add64_in_branch.uses_flat_scratch, 0
+ .set add64_in_branch.has_dyn_sized_stack, 0
+ .set add64_in_branch.has_recursion, 0
+ .set add64_in_branch.has_indirect_call, 0
+ .section .AMDGPU.csdata,"",@progbits
+; Kernel info:
+; codeLenInByte = 80
+; TotalNumSgprs: 20
+; NumVgprs: 3
+; NumAgprs: 0
+; TotalNumVgprs: 3
+; ScratchSize: 0
+; MemoryBound: 0
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 2
+; VGPRBlocks: 0
+; NumSGPRsForWavesPerEU: 20
+; NumVGPRsForWavesPerEU: 3
+; Occupancy: 10
+; WaveLimiterHint : 0
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 2
+ .section .AMDGPU.config,"",@progbits
+ .long 45096
+ .long 0
+ .long 165608
+ .long 0
+ .long 45100
+ .long 0
+ .long 165580
+ .long 1
+ .long 165584
+ .long 1
+ .long 4
+ .long 0
+ .long 8
+ .long 0
+ .text
+ .globl add_select_vop3 ; -- Begin function add_select_vop3
+ .p2align 8
+ .type add_select_vop3,@function
+add_select_vop3: ; @add_select_vop3
+; %bb.0:
+ v_add_u32_e32 v0, s0, v0
+ ;;#ASMSTART
+ ; def vcc
+ ;;#ASMEND
+ ds_write_b32 v0, v0
+ ;;#ASMSTART
+ ; use vcc
+ ;;#ASMEND
+ s_endpgm
+.Lfunc_end10:
+ .size add_select_vop3, .Lfunc_end10-add_select_vop3
+ ; -- End function
+ .set add_select_vop3.num_vgpr, 1
+ .set add_select_vop3.num_agpr, 0
+ .set add_select_vop3.numbered_sgpr, 1
+ .set add_select_vop3.num_named_barrier, 0
+ .set add_select_vop3.private_seg_size, 0
+ .set add_select_vop3.uses_vcc, 1
+ .set add_select_vop3.uses_flat_scratch, 0
+ .set add_select_vop3.has_dyn_sized_stack, 0
+ .set add_select_vop3.has_recursion, 0
+ .set add_select_vop3.has_indirect_call, 0
+ .section .AMDGPU.csdata,"",@progbits
+; Kernel info:
+; codeLenInByte = 16
+; TotalNumSgprs: 5
+; NumVgprs: 1
+; NumAgprs: 0
+; TotalNumVgprs: 1
+; ScratchSize: 0
+; MemoryBound: 0
+; FloatMode: 240
+; IeeeMode: 0
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 0
+; VGPRBlocks: 0
+; NumSGPRsForWavesPerEU: 5
+; NumVGPRsForWavesPerEU: 1
+; Occupancy: 10
+; WaveLimiterHint : 0
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 0
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 0
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+ .section .AMDGPU.gpr_maximums,"",@progbits
+ .set amdgpu.max_num_vgpr, 0
+ .set amdgpu.max_num_agpr, 0
+ .set amdgpu.max_num_sgpr, 0
+ .section .AMDGPU.csdata,"",@progbits
+ .section ".note.GNU-stack","",@progbits
+ .amd_amdgpu_isa "amdgcn-unknown-linux-gnu-gfx908"