diff options
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 46 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 1 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll | 2 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/trans-coexecution-hazard.mir | 132 |
4 files changed, 181 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 0976fcc..bbed828 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1189,6 +1189,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
   }
   fixVALUPartialForwardingHazard(MI);
   fixVALUTransUseHazard(MI);
+  fixVALUTransCoexecutionHazards(MI);
   fixWMMAHazards(MI);
   fixShift64HighRegBug(MI);
   fixVALUMaskWriteHazard(MI);
@@ -1809,6 +1810,51 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
   return true;
 }
 
+bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
+  if (!AMDGPU::isGFX1250(ST) || // Coexecution disabled on other targets.
+      !SIInstrInfo::isVALU(*MI) || SIInstrInfo::isTRANS(*MI))
+    return false;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+  auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
+    if (!SIInstrInfo::isTRANS(I))
+      return false;
+
+    // RAW: Trans(I) writes vdst, VALU(MI) reads it as an explicit use.
+    Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
+    for (const MachineOperand &ValuUse : MI->explicit_uses()) {
+      if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
+        return true;
+    }
+
+    auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
+    if (!ValuDst || !ValuDst->isReg())
+      return false;
+
+    // WAR: Trans(I) reads a register explicitly, VALU(MI) writes it (vdst).
+    Register ValuDef = ValuDst->getReg();
+    for (const MachineOperand &TransUse : I.explicit_uses()) {
+      if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
+        return true;
+    }
+
+    return false;
+  };
+
+  auto IsExpiredFn = [](const MachineInstr &I, int) { // Any VALU expires it.
+    return SIInstrInfo::isVALU(I);
+  };
+
+  const int HasVALU = std::numeric_limits<int>::max(); // presumably the "expired" result; TODO confirm
+  if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU)
+    return false;
+
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
+  return true;
+}
+
 bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
   if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
     return false;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index bbc5585..ef6ddd8 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -104,6 +104,7 @@ private:
   bool fixLdsDirectVMEMHazard(MachineInstr *MI);
   bool fixVALUPartialForwardingHazard(MachineInstr *MI);
   bool fixVALUTransUseHazard(MachineInstr *MI);
+  bool fixVALUTransCoexecutionHazards(MachineInstr *MI);
   bool fixWMMAHazards(MachineInstr *MI);
   bool fixShift64HighRegBug(MachineInstr *MI);
   bool fixVALUMaskWriteHazard(MachineInstr *MI);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll
index 5936d6a..47b2b68 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll
@@ -66,6 +66,7 @@ define amdgpu_kernel void @sqrt_v2bf16(ptr addrspace(1) %r, ptr addrspace(1) %a)
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_sqrt_bf16_e32 v1.l, v0.l
+; GFX12-TRUE16-NEXT:    v_nop
 ; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_sqrt_bf16_e32 v0.l, v0.l
@@ -90,6 +91,7 @@ define amdgpu_kernel void @sqrt_v2bf16(ptr addrspace(1) %r, ptr addrspace(1) %a)
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s5, s1
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_sqrt_bf16_e32 v1, v0
+; GFX12-FAKE16-NEXT:    v_nop
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_sqrt_bf16_e32 v0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/trans-coexecution-hazard.mir b/llvm/test/CodeGen/AMDGPU/trans-coexecution-hazard.mir
new file mode 100644
index 0000000..fa27d68
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/trans-coexecution-hazard.mir
@@ -0,0 +1,132 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX1250 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX1200 %s
+
+---
+name: trans_writes_valu_reads_hazard
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: trans_writes_valu_reads_hazard
+    ; GFX1250: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
+    ;
+    ; GFX1200-LABEL: name: trans_writes_valu_reads_hazard
+    ; GFX1200: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    ; GFX1200-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
+    $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
+...
+
+---
+name: trans_writes_valu_valu_reads_hazard_covered
+body: |
+  bb.0:
+    ; GCN-LABEL: name: trans_writes_valu_valu_reads_hazard_covered
+    ; GCN: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
+    $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    $vgpr2 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec
+    $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
+...
+
+---
+name: trans_writes_salu_valu_reads_hazard
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: trans_writes_salu_valu_reads_hazard
+    ; GFX1250: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    ; GFX1250-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, $sgpr1, implicit-def $scc
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
+    ;
+    ; GFX1200-LABEL: name: trans_writes_salu_valu_reads_hazard
+    ; GFX1200: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    ; GFX1200-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, $sgpr1, implicit-def $scc
+    ; GFX1200-NEXT: $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
+    $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    $sgpr2 = S_ADD_U32 $sgpr0, $sgpr1, implicit-def $scc
+    $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
+...
+
+---
+name: trans_no_hazard
+body: |
+  bb.0:
+    ; GCN-LABEL: name: trans_no_hazard
+    ; GCN: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr0, $vgpr2, implicit $mode, implicit $exec
+    $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    $vgpr3 = V_ADD_F32_e32 $vgpr0, $vgpr2, implicit $mode, implicit $exec
+...
+
+---
+name: trans_reads_valu_writes_hazard
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: trans_reads_valu_writes_hazard
+    ; GFX1250: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec
+    ;
+    ; GFX1200-LABEL: name: trans_reads_valu_writes_hazard
+    ; GFX1200: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    ; GFX1200-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec
+    $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    $vgpr0 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec
+...
+
+---
+name: trans_reads_valu_valu_writes_hazard_covered
+body: |
+  bb.0:
+    ; GCN-LABEL: name: trans_reads_valu_valu_writes_hazard_covered
+    ; GCN: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr4, $vgpr2, implicit $mode, implicit $exec
+    $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    $vgpr2 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec
+    $vgpr0 = V_ADD_F32_e32 $vgpr4, $vgpr2, implicit $mode, implicit $exec
+...
+
+---
+name: trans_reads__salu_valu_writes_hazard
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: trans_reads__salu_valu_writes_hazard
+    ; GFX1250: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    ; GFX1250-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, $sgpr1, implicit-def $scc
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr4, $vgpr2, implicit $mode, implicit $exec
+    ;
+    ; GFX1200-LABEL: name: trans_reads__salu_valu_writes_hazard
+    ; GFX1200: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    ; GFX1200-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, $sgpr1, implicit-def $scc
+    ; GFX1200-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr4, $vgpr2, implicit $mode, implicit $exec
+    $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    $sgpr2 = S_ADD_U32 $sgpr0, $sgpr1, implicit-def $scc
+    $vgpr0 = V_ADD_F32_e32 $vgpr4, $vgpr2, implicit $mode, implicit $exec
+...
+
+---
+name: trans_writes_trans_reads_no_hazard
+body: |
+  bb.0:
+    ; GCN-LABEL: name: trans_writes_trans_reads_no_hazard
+    ; GCN: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr2 = V_SQRT_F32_e32 $vgpr1, implicit $mode, implicit $exec
+    $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    $vgpr2 = V_SQRT_F32_e32 $vgpr1, implicit $mode, implicit $exec
+...
+
+---
+name: trans_reads_trans_writes_no_hazard
+body: |
+  bb.0:
+    ; GCN-LABEL: name: trans_reads_trans_writes_no_hazard
+    ; GCN: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr0 = V_SQRT_F32_e32 $vgpr2, implicit $mode, implicit $exec
+    $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    $vgpr0 = V_SQRT_F32_e32 $vgpr2, implicit $mode, implicit $exec
+...
|