diff options
author | Jay Foad <jay.foad@amd.com> | 2024-01-18 13:50:59 +0000 |
---|---|---|
committer | Jay Foad <jay.foad@amd.com> | 2024-01-18 13:50:59 +0000 |
commit | 745b1932609e3e976b0d85a7c56f375767f5f900 (patch) | |
tree | aba6af25269867ace710ca603deb655ab0619b76 | |
parent | e5a34f9226ef56669f670dc32661934ee3e56f37 (diff) | |
download | llvm-745b1932609e3e976b0d85a7c56f375767f5f900.zip llvm-745b1932609e3e976b0d85a7c56f375767f5f900.tar.gz llvm-745b1932609e3e976b0d85a7c56f375767f5f900.tar.bz2 |
[AMDGPU] Regenerate tests for #77892 after #77438
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll | 164 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll | 120 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll | 48 |
3 files changed, 234 insertions, 98 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll index c599c1b..2f29a1a 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll @@ -17,7 +17,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, ; GFX12-SDAG-LABEL: local_atomic_fadd_v2f16_noret: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: ds_pk_add_f16 v0, v1 ; GFX12-SDAG-NEXT: s_endpgm @@ -25,7 +25,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, ; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_noret: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1 ; GFX12-GISEL-NEXT: s_endpgm @@ -37,20 +37,20 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, ; GFX12-SDAG-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: ds_pk_add_bf16 v0, v1 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX12-GISEL-NEXT: ds_pk_add_bf16 v1, v0 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_endpgm %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) @@ -60,16 +60,24 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> %data) { ; GFX12-SDAG-LABEL: local_atomic_fadd_v2f16_rtn: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_rtn: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) ret <2 x half> %ret @@ -78,19 +86,27 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> %data) { ; GFX12-SDAG-LABEL: local_atomic_fadd_v2bf16_rtn: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_rtn: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) @@ -101,7 +117,7 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-SDAG-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 @@ -110,7 +126,7 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da ; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-GISEL-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 @@ -122,16 +138,24 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_rtn: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_rtn: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) ret <2 x half> %ret @@ -141,7 +165,7 @@ define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %da ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-SDAG-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 @@ -150,7 +174,7 @@ define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %da ; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-GISEL-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 @@ -162,16 +186,24 @@ define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %da define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) { ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2bf16_rtn: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_rtn: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) ret <2 x i16> %ret @@ -181,7 +213,7 @@ define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr ; GFX12-SDAG-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 @@ -191,7 +223,7 @@ define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr ; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 @@ -204,16 +236,24 @@ define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> %data) { ; GFX12-SDAG-LABEL: global_atomic_fadd_v2bf16_rtn: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_rtn: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) ret <2 x i16> %ret @@ -222,13 +262,21 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> define void @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) { ; GFX12-SDAG-LABEL: global_atomic_pk_add_v2f16: ; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: global_atomic_pk_add_v2f16: ; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] main_body: @@ -239,16 +287,24 @@ main_body: define <2 x half> @global_atomic_pk_add_v2f16_rtn(ptr addrspace(1) %ptr, <2 x half> %data) { ; GFX12-SDAG-LABEL: global_atomic_pk_add_v2f16_rtn: ; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: global_atomic_pk_add_v2f16_rtn: ; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] main_body: %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) @@ -295,13 +351,13 @@ define amdgpu_ps <2 x half> @raw_buffer_atomic_add_v2f16_ret_offset(<2 x half> % ; GFX12-SDAG-LABEL: raw_buffer_atomic_add_v2f16_ret_offset: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_ret_offset: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0) ret <2 x half> %ret @@ -311,13 +367,13 @@ define amdgpu_ps <2 x half> @raw_buffer_atomic_add_v2f16_ret(<2 x half> %val, <4 ; GFX12-SDAG-LABEL: raw_buffer_atomic_add_v2f16_ret: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_ret: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret <2 x half> %ret @@ -327,13 +383,13 @@ define amdgpu_ps float @struct_buffer_atomic_add_v2f16_ret(<2 x half> %val, <4 x ; GFX12-SDAG-LABEL: struct_buffer_atomic_add_v2f16_ret: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2f16_ret: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %orig = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) %r = bitcast <2 x half> %orig to float @@ -364,10 +420,10 @@ define amdgpu_ps float @struct_buffer_atomic_add_v2bf16_ret(<2 x bfloat> %val, < ; GFX12-SDAG-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: flat_store_b32 v[1:2], v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2bf16_ret: @@ -375,10 +431,10 @@ define amdgpu_ps float @struct_buffer_atomic_add_v2bf16_ret(<2 x bfloat> %val, < ; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b32 v[1:2], v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %orig = call <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) store <2 x bfloat> %orig, ptr null @@ -427,10 +483,10 @@ define amdgpu_ps float @raw_buffer_atomic_add_v2bf16_ret(<2 x bfloat> %val, <4 x ; GFX12-SDAG-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: flat_store_b32 v[1:2], v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2bf16_ret: @@ -438,10 +494,10 @@ define amdgpu_ps float @raw_buffer_atomic_add_v2bf16_ret(<2 x bfloat> %val, <4 x ; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b32 v[1:2], v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %orig = call <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) store <2 x bfloat> %orig, ptr null diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll index f4a7b20..02e2715 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll @@ -5,7 +5,11 @@ define float @test_amdgcn_dot4_f32_fp8_bf8(i32 %a, i32 %b, float %c) { ; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_bf8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dot4_f32_fp8_bf8 v0, v0, v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: @@ -16,7 +20,11 @@ entry: define float @test_amdgcn_dot4_f32_fp8_bf8_fabs(i32 %a, i32 %b, float %c) { ; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_bf8_fabs: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: @@ -28,7 +36,11 @@ entry: define float @test_amdgcn_dot4_f32_fp8_bf8_fneg(i32 %a, i32 %b, float %c) { ; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_bf8_fneg: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: @@ -40,7 +52,11 @@ entry: define float @test_amdgcn_dot4_f32_fp8_bf8_fabs_fneg(i32 %a, i32 %b, float %c) { ; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_bf8_fabs_fneg: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: @@ -53,7 +69,11 @@ entry: define float @test_amdgcn_dot4_f32_fp8_bf8_fneg_fabs(i32 %a, i32 %b, float %c) { ; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_bf8_fneg_fabs: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: @@ -66,7 +86,11 @@ entry: define float @test_amdgcn_dot4_f32_bf8_fp8(i32 %a, i32 %b, float %c) { ; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_fp8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dot4_f32_bf8_fp8 v0, v0, v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: @@ -77,7 +101,11 @@ entry: define float @test_amdgcn_dot4_f32_bf8_fp8_fabs(i32 %a, i32 %b, float %c) { ; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_fp8_fabs: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: @@ -89,7 +117,11 @@ entry: define float @test_amdgcn_dot4_f32_bf8_fp8_fneg(i32 %a, i32 %b, float %c) { ; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_fp8_fneg: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: @@ -101,7 +133,11 @@ entry: define float @test_amdgcn_dot4_f32_bf8_fp8_fabs_fneg(i32 %a, i32 %b, float %c) { ; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_fp8_fabs_fneg: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: @@ -114,7 +150,11 @@ entry: define float @test_amdgcn_dot4_f32_bf8_fp8_fneg_fabs(i32 %a, i32 %b, float %c) { ; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_fp8_fneg_fabs: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: @@ -127,7 +167,11 @@ entry: define float @test_amdgcn_dot4_f32_fp8_fp8(i32 %a, i32 %b, float %c) { ; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_fp8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dot4_f32_fp8_fp8 v0, v0, v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: @@ -138,7 +182,11 @@ entry: define float @test_amdgcn_dot4_f32_fp8_fp8_fabs(i32 %a, i32 %b, float %c) { ; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_fp8_fabs: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: @@ -150,7 +198,11 @@ entry: define float @test_amdgcn_dot4_f32_fp8_fp8_fneg(i32 %a, i32 %b, float %c) { ; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_fp8_fneg: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: @@ -162,7 +214,11 @@ entry: define float @test_amdgcn_dot4_f32_fp8_fp8_fabs_fneg(i32 %a, i32 %b, float %c) { ; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_fp8_fabs_fneg: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: @@ -175,7 +231,11 @@ entry: define float @test_amdgcn_dot4_f32_fp8_fp8_fneg_fabs(i32 %a, i32 %b, float %c) { ; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_fp8_fneg_fabs: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: @@ -188,7 +248,11 @@ entry: define float @test_amdgcn_dot4_f32_bf8_bf8(i32 %a, i32 %b, float %c) { ; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_bf8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dot4_f32_bf8_bf8 v0, v0, v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: @@ -199,7 +263,11 @@ entry: define float @test_amdgcn_dot4_f32_bf8_bf8_fabs(i32 %a, i32 %b, float %c) { ; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_bf8_fabs: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: @@ -211,7 +279,11 @@ entry: define float @test_amdgcn_dot4_f32_bf8_bf8_fneg(i32 %a, i32 %b, float %c) { ; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_bf8_fneg: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: @@ -223,7 +295,11 @@ entry: define float @test_amdgcn_dot4_f32_bf8_bf8_fabs_fneg(i32 %a, i32 %b, float %c) { ; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_bf8_fabs_fneg: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: @@ -236,7 +312,11 @@ entry: define float @test_amdgcn_dot4_f32_bf8_bf8_fneg_fabs(i32 %a, i32 %b, float %c) { ; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_bf8_fneg_fabs: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll index 04767a4..12f9029 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll @@ -6,13 +6,13 @@ define amdgpu_ps float @atomic_pk_add_f16_1d_v2(<8 x i32> inreg %rsrc, <2 x half ; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v2: ; GFX12-SDAG: ; %bb.0: ; %main_body ; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: atomic_pk_add_f16_1d_v2: ; GFX12-GISEL: ; %bb.0: ; %main_body ; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: %out = call <2 x half> @llvm.amdgcn.image.atomic.pk.add.f16.1d.v2f16.v2f16(<2 x half> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -24,13 +24,13 @@ define amdgpu_ps float @atomic_pk_add_f16_1d_v2_nt(<8 x i32> inreg %rsrc, <2 x h ; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v2_nt: ; GFX12-SDAG: ; %bb.0: ; %main_body ; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: atomic_pk_add_f16_1d_v2_nt: ; GFX12-GISEL: ; %bb.0: ; %main_body ; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: %out = call <2 x half> @llvm.amdgcn.image.atomic.pk.add.f16.1d.v2f16.v2f16(<2 x half> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 2) @@ -42,14 +42,14 @@ define amdgpu_ps float @atomic_pk_add_f16_1d_v2_noret(<8 x i32> inreg %rsrc, <2 ; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v2_noret: ; GFX12-SDAG: ; %bb.0: ; %main_body ; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: atomic_pk_add_f16_1d_v2_noret: ; GFX12-GISEL: ; %bb.0: ; %main_body ; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: @@ -61,13 +61,13 @@ define amdgpu_ps float @atomic_pk_add_f16_1d_v4(<8 x i32> inreg %rsrc, <4 x half ; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v4: ; GFX12-SDAG: ; %bb.0: ; %main_body ; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: atomic_pk_add_f16_1d_v4: ; GFX12-GISEL: ; %bb.0: ; %main_body ; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: %out = call <4 x half> @llvm.amdgcn.image.atomic.pk.add.f16.1d.v4f16.v4f16(<4 x half> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -80,14 +80,14 @@ define amdgpu_ps float @atomic_pk_add_f16_1d_v4_noret(<8 x i32> inreg %rsrc, <4 ; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v4_noret: ; GFX12-SDAG: ; %bb.0: ; %main_body ; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: atomic_pk_add_f16_1d_v4_noret: ; GFX12-GISEL: ; %bb.0: ; %main_body ; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: @@ -101,10 +101,10 @@ define amdgpu_ps float @atomic_pk_add_bf16_1d_v2(<8 x i32> inreg %rsrc, <2 x bfl ; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: flat_store_b32 v[1:2], v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v2: @@ -112,10 +112,10 @@ define amdgpu_ps float @atomic_pk_add_bf16_1d_v2(<8 x i32> inreg %rsrc, <2 x bfl ; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b32 v[1:2], v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: %out = call <2 x bfloat> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v2bf16.v2bf16(<2 x bfloat> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -127,14 +127,14 @@ define amdgpu_ps float @atomic_pk_add_bf16_1d_v2_noret(<8 x i32> inreg %rsrc, <2 ; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v2_noret: ; GFX12-SDAG: ; %bb.0: ; %main_body ; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v2_noret: ; GFX12-GISEL: ; %bb.0: ; %main_body ; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: @@ -148,10 +148,10 @@ define amdgpu_ps float @atomic_pk_add_bf16_1d_v4(<8 x i32> inreg %rsrc, <4 x bfl ; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 0 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4: @@ -169,10 +169,10 @@ define amdgpu_ps float @atomic_pk_add_bf16_1d_v4(<8 x i32> inreg %rsrc, <4 x bfl ; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: %out = call <4 x bfloat> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v4bf16.v4bf16(<4 x bfloat> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -184,7 +184,7 @@ define amdgpu_ps float @atomic_pk_add_bf16_1d_v4_noret(<8 x i32> inreg %rsrc, <4 ; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v4_noret: ; GFX12-SDAG: ; %bb.0: ; %main_body ; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; @@ -201,7 +201,7 @@ define amdgpu_ps float @atomic_pk_add_bf16_1d_v4_noret(<8 x i32> inreg %rsrc, <4 ; GFX12-GISEL-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX12-GISEL-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: @@ -213,7 +213,7 @@ define amdgpu_ps float @atomic_pk_add_bf16_1d_v4_nt(<8 x i32> inreg %rsrc, <4 x ; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v4_nt: ; GFX12-SDAG: ; %bb.0: ; %main_body ; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; @@ -230,7 +230,7 @@ define amdgpu_ps float @atomic_pk_add_bf16_1d_v4_nt(<8 x i32> inreg %rsrc, <4 x ; GFX12-GISEL-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX12-GISEL-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: |