aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>2025-12-01 11:46:30 -0800
committerGitHub <noreply@github.com>2025-12-01 11:46:30 -0800
commitfffe9bcbc7d5d93872ad00a7f212483d749ae71d (patch)
tree5142a7aa8518620e178a2a2b97089d45298a57ca
parent00276b67d36a665119a6a7b39dbba69f45c44e58 (diff)
downloadllvm-fffe9bcbc7d5d93872ad00a7f212483d749ae71d.tar.gz
llvm-fffe9bcbc7d5d93872ad00a7f212483d749ae71d.tar.bz2
llvm-fffe9bcbc7d5d93872ad00a7f212483d749ae71d.zip
[AMDGPU] Allow hazard checks for WMMA co-exec (#168805)
Now we are just inserting V_NOP instrtuctions, try to schedule something into the shadow. It is still somewhat imprecise, for example AdvanceCycle() will use TII.getNumWaitStates() anyway, but in a scheduling mode we are not required to be precise. We must be finally precise in the hazard recognizer mode. Then EmittedInstrs buffer is also limited to MaxLookAhead even though VALU only hazards may actually never expire and require an endless buffer. But that's OK, we can at least mitigate what the buffer can hold. The buffer is also currently much bigger than any of VALU hazards may need. That said the rest of the 'fix*' functions here can be changed the same way, these which are using V_NOPs. This one is just the worst because it may require up to 9 nops.
-rw-r--r--llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp6
-rw-r--r--llvm/test/CodeGen/AMDGPU/misched-into-wmma-hazard-shadow.mir56
2 files changed, 62 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 2edeced74c13..6f1a5210fb7e 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -190,6 +190,12 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
if (checkFPAtomicToDenormModeHazard(MI) > 0)
return HazardType;
+ // Hazards which cannot be mitigated with S_NOPs.
+ if (!IsHazardRecognizerMode) {
+ if (checkWMMACoexecutionHazards(MI) > 0)
+ return Hazard;
+ }
+
if (ST.hasNoDataDepHazard())
return NoHazard;
diff --git a/llvm/test/CodeGen/AMDGPU/misched-into-wmma-hazard-shadow.mir b/llvm/test/CodeGen/AMDGPU/misched-into-wmma-hazard-shadow.mir
new file mode 100644
index 000000000000..e3c8acc837f0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/misched-into-wmma-hazard-shadow.mir
@@ -0,0 +1,56 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -march=amdgcn -mcpu=gfx1250 -run-pass=postmisched,post-RA-hazard-rec %s -o - | FileCheck --check-prefix=GCN %s
+
+# Bring all independent V_LSHL_ADD_U32_e64 instructions into the shadow
+# of the WMMA so then hazard recognizer only need to insert 4 V_NOP_e32
+# instructions instead of 8.
+
+---
+name: test_wmma_scale_f32_16x16x128_f8f6f4_shadow_sched
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
+
+ ; GCN-LABEL: name: test_wmma_scale_f32_16x16x128_f8f6f4_shadow_sched
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr46 = V_LSHL_ADD_U32_e64 killed $vgpr43, 1, $vgpr43, implicit $exec
+ ; GCN-NEXT: $vgpr47 = V_LSHL_ADD_U32_e64 killed $vgpr42, 1, $vgpr42, implicit $exec
+ ; GCN-NEXT: $vgpr48 = V_LSHL_ADD_U32_e64 killed $vgpr41, 1, $vgpr41, implicit $exec
+ ; GCN-NEXT: $vgpr49 = V_LSHL_ADD_U32_e64 killed $vgpr40, 1, $vgpr40, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: $vgpr4 = V_ADD_F32_e32 1065353216, killed $vgpr4, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr5 = V_ADD_F32_e32 1065353216, killed $vgpr5, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr6 = V_ADD_F32_e32 1065353216, killed $vgpr6, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr7 = V_ADD_F32_e32 1065353216, killed $vgpr7, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr8 = V_ADD_F32_e32 1065353216, killed $vgpr8, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr9 = V_ADD_F32_e32 1065353216, killed $vgpr9, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr10 = V_ADD_F32_e32 1065353216, killed $vgpr10, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr11 = V_ADD_F32_e32 1065353216, killed $vgpr11, implicit $mode, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORDX4 renamable $vgpr44_vgpr45, killed renamable $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORDX4 renamable $vgpr44_vgpr45, killed renamable $vgpr8_vgpr9_vgpr10_vgpr11, 32, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORDX4 killed renamable $vgpr44_vgpr45, killed renamable $vgpr46_vgpr47_vgpr48_vgpr49, 64, 0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr4 = V_ADD_F32_e32 1065353216, $vgpr4, implicit $mode, implicit $exec
+ $vgpr5 = V_ADD_F32_e32 1065353216, $vgpr5, implicit $mode, implicit $exec
+ $vgpr6 = V_ADD_F32_e32 1065353216, $vgpr6, implicit $mode, implicit $exec
+ $vgpr7 = V_ADD_F32_e32 1065353216, $vgpr7, implicit $mode, implicit $exec
+ $vgpr8 = V_ADD_F32_e32 1065353216, $vgpr8, implicit $mode, implicit $exec
+ $vgpr9 = V_ADD_F32_e32 1065353216, $vgpr9, implicit $mode, implicit $exec
+ $vgpr10 = V_ADD_F32_e32 1065353216, $vgpr10, implicit $mode, implicit $exec
+ $vgpr11 = V_ADD_F32_e32 1065353216, $vgpr11, implicit $mode, implicit $exec
+ $vgpr46 = V_LSHL_ADD_U32_e64 $vgpr43, 1, $vgpr43, implicit $exec
+ $vgpr47 = V_LSHL_ADD_U32_e64 $vgpr42, 1, $vgpr42, implicit $exec
+ $vgpr48 = V_LSHL_ADD_U32_e64 $vgpr41, 1, $vgpr41, implicit $exec
+ $vgpr49 = V_LSHL_ADD_U32_e64 $vgpr40, 1, $vgpr40, implicit $exec
+ GLOBAL_STORE_DWORDX4 renamable $vgpr44_vgpr45, killed renamable $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX4 renamable $vgpr44_vgpr45, killed renamable $vgpr8_vgpr9_vgpr10_vgpr11, 32, 0, implicit $exec
+ GLOBAL_STORE_DWORDX4 renamable $vgpr44_vgpr45, killed renamable $vgpr46_vgpr47_vgpr48_vgpr49, 64, 0, implicit $exec
+ S_ENDPGM 0
+...