diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
34 files changed, 4658 insertions, 1470 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll index 002c03aa..e86f747 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll @@ -551,7 +551,9 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 { ; ; GFX10CU-LABEL: name: workgroup_one_as_release ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 16240 ; GFX10CU-NEXT: S_WAITCNT_lds_direct + ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_one_as_release @@ -562,6 +564,8 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 { ; ; GFX11CU-LABEL: name: workgroup_one_as_release ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 1015 + ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("workgroup-one-as") release @@ -587,7 +591,9 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 { ; ; GFX10CU-LABEL: name: workgroup_one_as_acq_rel ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 16240 ; GFX10CU-NEXT: S_WAITCNT_lds_direct + ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_one_as_acq_rel @@ -599,6 +605,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 { ; ; GFX11CU-LABEL: name: workgroup_one_as_acq_rel ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 1015 + ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("workgroup-one-as") acq_rel @@ -624,7 +632,9 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 { ; ; GFX10CU-LABEL: name: workgroup_one_as_seq_cst ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 16240 ; GFX10CU-NEXT: S_WAITCNT_lds_direct + ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_one_as_seq_cst @@ -636,6 +646,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 { ; ; GFX11CU-LABEL: name: workgroup_one_as_seq_cst ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 1015 + ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("workgroup-one-as") seq_cst @@ -1305,8 +1317,9 @@ define amdgpu_kernel void @workgroup_release() #0 { ; ; GFX10CU-LABEL: name: workgroup_release ; GFX10CU: bb.0.entry: - ; GFX10CU-NEXT: S_WAITCNT_soft 49279 + ; GFX10CU-NEXT: S_WAITCNT_soft 112 ; GFX10CU-NEXT: S_WAITCNT_lds_direct + ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_release @@ -1317,7 +1330,8 @@ define amdgpu_kernel void @workgroup_release() #0 { ; ; GFX11CU-LABEL: name: workgroup_release ; GFX11CU: bb.0.entry: - ; GFX11CU-NEXT: S_WAITCNT_soft 64519 + ; GFX11CU-NEXT: S_WAITCNT_soft 7 + ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("workgroup") release @@ -1345,8 +1359,9 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 { ; ; GFX10CU-LABEL: name: workgroup_acq_rel ; GFX10CU: bb.0.entry: - ; GFX10CU-NEXT: S_WAITCNT_soft 49279 + ; GFX10CU-NEXT: S_WAITCNT_soft 112 ; GFX10CU-NEXT: S_WAITCNT_lds_direct + ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_acq_rel @@ -1358,7 +1373,8 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 { ; ; GFX11CU-LABEL: name: workgroup_acq_rel ; GFX11CU: bb.0.entry: - ; GFX11CU-NEXT: S_WAITCNT_soft 64519 + ; GFX11CU-NEXT: S_WAITCNT_soft 7 + ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("workgroup") acq_rel @@ -1386,8 +1402,9 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 { ; ; GFX10CU-LABEL: name: workgroup_seq_cst ; GFX10CU: bb.0.entry: - ; GFX10CU-NEXT: S_WAITCNT_soft 49279 + ; GFX10CU-NEXT: S_WAITCNT_soft 112 ; GFX10CU-NEXT: S_WAITCNT_lds_direct + ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_seq_cst @@ -1399,7 +1416,8 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 { ; ; GFX11CU-LABEL: name: workgroup_seq_cst ; GFX11CU: bb.0.entry: - ; GFX11CU-NEXT: S_WAITCNT_soft 64519 + ; GFX11CU-NEXT: S_WAITCNT_soft 7 + ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("workgroup") seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/add-max.ll b/llvm/test/CodeGen/AMDGPU/add-max.ll index 00c6656..b3a7057 100644 --- a/llvm/test/CodeGen/AMDGPU/add-max.ll +++ b/llvm/test/CodeGen/AMDGPU/add-max.ll @@ -5,7 +5,7 @@ define amdgpu_ps float @add_max_u32_vvv(i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: add_max_u32_vvv: ; GCN: ; %bb.0: -; GCN-NEXT: v_add_max_u32_e64 v0, v0, v1, v2 +; GCN-NEXT: v_add_max_u32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog %add = add i32 %a, %b %max = call i32 @llvm.umax.i32(i32 %add, i32 %c) @@ -16,7 +16,7 @@ define amdgpu_ps float @add_max_u32_vvv(i32 %a, i32 %b, i32 %c) { define amdgpu_ps float @add_max_u32_svv(i32 inreg %a, i32 %b, i32 %c) { ; GCN-LABEL: add_max_u32_svv: ; GCN: ; %bb.0: -; GCN-NEXT: v_add_max_u32_e64 v0, s0, v0, v1 +; GCN-NEXT: v_add_max_u32 v0, s0, v0, v1 ; GCN-NEXT: ; return to shader part epilog %add = add i32 %a, %b %max = call i32 @llvm.umax.i32(i32 %add, i32 %c) @@ -27,7 +27,7 @@ define amdgpu_ps float @add_max_u32_svv(i32 inreg %a, i32 %b, i32 %c) { define amdgpu_ps float @add_max_u32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) { ; SDAG-LABEL: add_max_u32_ssv: ; SDAG: ; %bb.0: -; SDAG-NEXT: v_add_max_u32_e64 v0, s0, s1, v0 +; SDAG-NEXT: v_add_max_u32 v0, s0, s1, v0 ; SDAG-NEXT: ; return to shader part epilog ; ; GISEL-LABEL: add_max_u32_ssv: @@ -59,7 +59,7 @@ define amdgpu_ps float @add_max_u32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c define amdgpu_ps float @add_max_u32_vsi(i32 %a, i32 inreg %b) { ; GCN-LABEL: add_max_u32_vsi: ; GCN: ; %bb.0: -; GCN-NEXT: v_add_max_u32_e64 v0, v0, s0, 4 +; GCN-NEXT: v_add_max_u32 v0, v0, s0, 4 ; GCN-NEXT: ; return to shader part epilog %add = add i32 %a, %b %max = call i32 @llvm.umax.i32(i32 %add, i32 4) @@ -70,7 +70,7 @@ define amdgpu_ps float @add_max_u32_vsi(i32 %a, i32 inreg %b) { define amdgpu_ps float @add_max_u32_svl(i32 inreg %a, i32 %b) { ; GCN-LABEL: add_max_u32_svl: ; GCN: ; %bb.0: -; GCN-NEXT: v_add_max_u32_e64 v0, s0, v0, 0x64 +; GCN-NEXT: v_add_max_u32 v0, s0, v0, 0x64 ; GCN-NEXT: ; return to shader part epilog %add = add i32 %a, %b %max = call i32 @llvm.umax.i32(i32 %add, i32 100) @@ -81,7 +81,7 @@ define amdgpu_ps float @add_max_u32_svl(i32 inreg %a, i32 %b) { define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b) { ; SDAG-LABEL: add_max_u32_slv: ; SDAG: ; %bb.0: -; SDAG-NEXT: v_add_max_u32_e64 v0, 0x64, s0, v0 +; SDAG-NEXT: v_add_max_u32 v0, 0x64, s0, v0 ; SDAG-NEXT: ; return to shader part epilog ; ; GISEL-LABEL: add_max_u32_slv: @@ -99,7 +99,7 @@ define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b) { define amdgpu_ps float @add_max_i32_vvv(i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: add_max_i32_vvv: ; GCN: ; %bb.0: -; GCN-NEXT: v_add_max_i32_e64 v0, v0, v1, v2 +; GCN-NEXT: v_add_max_i32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog %add = add i32 %a, %b %max = call i32 @llvm.smax.i32(i32 %add, i32 %c) @@ -110,7 +110,7 @@ define amdgpu_ps float @add_max_i32_vvv(i32 %a, i32 %b, i32 %c) { define amdgpu_ps float @add_min_u32_vvv(i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: add_min_u32_vvv: ; GCN: ; %bb.0: -; GCN-NEXT: v_add_min_u32_e64 v0, v0, v1, v2 +; GCN-NEXT: v_add_min_u32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog %add = add i32 %a, %b %max = call i32 @llvm.umin.i32(i32 %add, i32 %c) @@ -121,7 +121,7 @@ define amdgpu_ps float @add_min_u32_vvv(i32 %a, i32 %b, i32 %c) { define amdgpu_ps float @add_min_i32_vvv(i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: add_min_i32_vvv: ; GCN: ; %bb.0: -; GCN-NEXT: v_add_min_i32_e64 v0, v0, v1, v2 +; GCN-NEXT: v_add_min_i32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog %add = add i32 %a, %b %max = call i32 @llvm.smin.i32(i32 %add, i32 %c) diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 7ee0015f..711d57b 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -39137,7 +39137,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v2 ; GFX1250-NEXT: v_add_nc_u32_e32 v2, 32, v2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_min_u32_e64 v2, v3, -1, v2 +; GFX1250-NEXT: v_add_min_u32 v2, v3, -1, v2 ; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v2, v[0:1] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0 @@ -39487,8 +39487,8 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX1250-NEXT: v_dual_ashrrev_i32 v5, 31, v5 :: v_dual_ashrrev_i32 v4, 31, v4 ; GFX1250-NEXT: v_dual_add_nc_u32 v5, 32, v5 :: v_dual_add_nc_u32 v4, 32, v4 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_add_min_u32_e64 v5, v7, -1, v5 -; GFX1250-NEXT: v_add_min_u32_e64 v4, v6, -1, v4 +; GFX1250-NEXT: v_add_min_u32 v5, v7, -1, v5 +; GFX1250-NEXT: v_add_min_u32 v4, v6, -1, v4 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v5, v[0:1] ; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v4, v[2:3] @@ -39979,9 +39979,9 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX1250TRUE16-NEXT: v_dual_add_nc_u32 v7, 32, v7 :: v_dual_add_nc_u32 v6, 32, v6 ; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250TRUE16-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; GFX1250TRUE16-NEXT: v_add_min_u32_e64 v7, v10, -1, v7 +; GFX1250TRUE16-NEXT: v_add_min_u32 v7, v10, -1, v7 ; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250TRUE16-NEXT: v_add_min_u32_e64 v6, v9, -1, v6 +; GFX1250TRUE16-NEXT: v_add_min_u32 v6, v9, -1, v6 ; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[2:3], v7, v[2:3] ; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[4:5], v6, v[4:5] @@ -39991,7 +39991,7 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX1250TRUE16-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX1250TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250TRUE16-NEXT: v_add_min_u32_e64 v8, v11, -1, v8 +; GFX1250TRUE16-NEXT: v_add_min_u32 v8, v11, -1, v8 ; GFX1250TRUE16-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v4, v5, v4 bitop3:0x54 ; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2 @@ -40027,8 +40027,8 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX1250FAKE16-NEXT: v_dual_ashrrev_i32 v6, 31, v6 :: v_dual_ashrrev_i32 v7, 31, v7 ; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v6, 32, v6 :: v_dual_add_nc_u32 v7, 32, v7 ; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250FAKE16-NEXT: v_add_min_u32_e64 v6, v10, -1, v6 -; GFX1250FAKE16-NEXT: v_add_min_u32_e64 v7, v11, -1, v7 +; GFX1250FAKE16-NEXT: v_add_min_u32 v6, v10, -1, v6 +; GFX1250FAKE16-NEXT: v_add_min_u32 v7, v11, -1, v7 ; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[2:3], v6, v[2:3] ; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[0:1], v7, v[0:1] @@ -40038,7 +40038,7 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX1250FAKE16-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250FAKE16-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX1250FAKE16-NEXT: v_add_min_u32_e64 v8, v9, -1, v8 +; GFX1250FAKE16-NEXT: v_add_min_u32 v8, v9, -1, v8 ; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54 ; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2 @@ -40656,18 +40656,18 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX1250-NEXT: v_dual_add_nc_u32 v9, 32, v9 :: v_dual_add_nc_u32 v8, 32, v8 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_dual_ashrrev_i32 v10, 31, v10 :: v_dual_bitop2_b32 v11, v0, v1 bitop3:0x14 -; GFX1250-NEXT: v_add_min_u32_e64 v9, v13, -1, v9 +; GFX1250-NEXT: v_add_min_u32 v9, v13, -1, v9 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_add_min_u32_e64 v8, v12, -1, v8 +; GFX1250-NEXT: v_add_min_u32 v8, v12, -1, v8 ; GFX1250-NEXT: v_dual_ashrrev_i32 v11, 31, v11 :: v_dual_add_nc_u32 v10, 32, v10 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_lshlrev_b64_e32 v[4:5], v9, v[4:5] ; GFX1250-NEXT: v_lshlrev_b64_e32 v[6:7], v8, v[6:7] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1250-NEXT: v_add_nc_u32_e32 v11, 32, v11 -; GFX1250-NEXT: v_add_min_u32_e64 v10, v14, -1, v10 +; GFX1250-NEXT: v_add_min_u32 v10, v14, -1, v10 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_add_min_u32_e64 v11, v15, -1, v11 +; GFX1250-NEXT: v_add_min_u32 v11, v15, -1, v11 ; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v10, v[2:3] ; GFX1250-NEXT: v_min_u32_e32 v6, 1, v6 ; GFX1250-NEXT: v_min_u32_e32 v4, 1, v4 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll index 53b2542..142290a 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll @@ -3611,10 +3611,10 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) { ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0 glc +; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: buffer_load_ushort v6, off, s[16:19], 0 offset:4 glc -; SDAG-NEXT: s_waitcnt vmcnt(1) -; SDAG-NEXT: v_lshrrev_b32_e32 v7, 8, v0 ; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_lshrrev_b32_e32 v7, 8, v0 ; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v6 ; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -3627,12 +3627,12 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) { ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0 glc +; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:4 glc -; GISEL-NEXT: s_waitcnt vmcnt(1) +; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) @@ -3652,6 +3652,7 @@ define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) { ; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0 +; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: buffer_store_short v4, off, s[16:19], 0 offset:4 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] @@ -3671,6 +3672,7 @@ define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GISEL-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0 +; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 35d178c..6f91222 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -17562,5 +17562,1363 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ret void } + +define amdgpu_kernel void @frem_v2f64_const_zero_num(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: frem_v2f64_const_zero_num: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[0:1] +; SI-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-NEXT: s_cselect_b32 s8, 0x7ff80000, 0 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[2:3] +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cselect_b32 s4, 0x7ff80000, 0 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; CI-LABEL: frem_v2f64_const_zero_num: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 +; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: s_and_b64 s[2:3], vcc, exec +; CI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[2:3] +; CI-NEXT: s_cselect_b32 s8, 0x7ff80000, 0 +; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: v_mov_b32_e32 v1, s8 +; CI-NEXT: v_mov_b32_e32 v2, v0 +; CI-NEXT: s_and_b64 s[4:5], vcc, exec +; CI-NEXT: s_cselect_b32 s4, 0x7ff80000, 0 +; CI-NEXT: v_mov_b32_e32 v3, s4 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: frem_v2f64_const_zero_num: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_and_b64 s[2:3], vcc, exec +; VI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[2:3] +; VI-NEXT: s_cselect_b32 s2, 0x7ff80000, 0 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_and_b64 s[0:1], vcc, exec +; VI-NEXT: s_cselect_b32 s0, 0x7ff80000, 0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: frem_v2f64_const_zero_num: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[3:4] +; GFX9-NEXT: s_cselect_b32 s4, 0x7ff80000, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_cselect_b32 s2, 0x7ff80000, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: frem_v2f64_const_zero_num: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2] +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: s_and_b32 s2, vcc_lo, exec_lo +; GFX10-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4] +; GFX10-NEXT: s_cselect_b32 s2, 0x7ff80000, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: s_and_b32 s3, vcc_lo, exec_lo +; GFX10-NEXT: s_cselect_b32 s3, 0x7ff80000, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: frem_v2f64_const_zero_num: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b128 v[1:4], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2] +; GFX11-NEXT: s_and_b32 s2, vcc_lo, exec_lo +; GFX11-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4] +; GFX11-NEXT: s_cselect_b32 s2, 0x7ff80000, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: s_and_b32 s3, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s3, 0x7ff80000, 0 +; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: global_store_b128 v0, v[0:3], s[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX1150-LABEL: frem_v2f64_const_zero_num: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-NEXT: v_mov_b32_e32 v0, 0 +; GFX1150-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-NEXT: global_load_b128 v[1:4], v0, s[2:3] +; GFX1150-NEXT: s_waitcnt vmcnt(0) +; GFX1150-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2] +; GFX1150-NEXT: s_and_b32 s2, vcc_lo, exec_lo +; GFX1150-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4] +; GFX1150-NEXT: s_cselect_b32 s2, 0x7ff80000, 0 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1150-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, v0 +; GFX1150-NEXT: s_and_b32 s3, vcc_lo, exec_lo +; GFX1150-NEXT: s_cselect_b32 s3, 0x7ff80000, 0 +; GFX1150-NEXT: v_mov_b32_e32 v3, s3 +; GFX1150-NEXT: global_store_b128 v0, v[0:3], s[0:1] +; GFX1150-NEXT: s_endpgm +; +; GFX1200-LABEL: frem_v2f64_const_zero_num: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1200-NEXT: v_mov_b32_e32 v0, 0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: global_load_b128 v[1:4], v0, s[2:3] +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2] +; GFX1200-NEXT: s_and_b32 s2, vcc_lo, exec_lo +; GFX1200-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4] +; GFX1200-NEXT: s_cselect_b32 s2, 0x7ff80000, 0 +; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, v0 +; GFX1200-NEXT: s_and_b32 s3, vcc_lo, exec_lo +; GFX1200-NEXT: s_cselect_b32 s3, 0x7ff80000, 0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_mov_b32_e32 v3, s3 +; GFX1200-NEXT: global_store_b128 v0, v[0:3], s[0:1] +; GFX1200-NEXT: s_endpgm + %r0 = load <2 x double>, ptr addrspace(1) %in, align 16 + %r1 = frem <2 x double> <double 0.0, double 0.0>, %r0 + store <2 x double> %r1, ptr addrspace(1) %out, align 16 + ret void +} + +define amdgpu_kernel void @frem_v2f64_const_one_denum(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: frem_v2f64_const_one_denum: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0 +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: s_cbranch_vccz .LBB15_2 +; SI-NEXT: ; %bb.1: ; %frem.else16 +; SI-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; SI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0 +; SI-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc +; SI-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz .LBB15_3 +; SI-NEXT: s_branch .LBB15_8 +; SI-NEXT: .LBB15_2: +; SI-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: .LBB15_3: ; %frem.compute15 +; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, 0x7ff00000 +; SI-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[2:3] +; SI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| +; SI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; SI-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] +; SI-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s2, v6 +; SI-NEXT: s_cselect_b32 s3, s2, 0 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_add_i32 s5, s3, -1 +; SI-NEXT: v_ldexp_f64 v[5:6], v[4:5], 26 +; SI-NEXT: s_cmp_lt_i32 s5, 27 +; SI-NEXT: s_cbranch_scc1 .LBB15_7 +; SI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader +; SI-NEXT: s_add_i32 s5, s3, 25 +; SI-NEXT: v_mov_b32_e32 v9, 0x43300000 +; SI-NEXT: v_mov_b32_e32 v4, 0 +; SI-NEXT: s_mov_b32 s3, 0x432fffff +; SI-NEXT: .LBB15_5: ; %frem.loop_body23 +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v8, v6 +; SI-NEXT: v_mov_b32_e32 v7, v5 +; SI-NEXT: v_bfi_b32 v5, s4, v9, v8 +; SI-NEXT: v_add_f64 v[10:11], v[7:8], v[4:5] +; SI-NEXT: v_add_f64 v[5:6], v[10:11], -v[4:5] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[7:8]|, s[2:3] +; SI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; SI-NEXT: v_add_f64 v[5:6], v[7:8], -v[5:6] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[5:6] +; SI-NEXT: v_add_f64 v[10:11], v[5:6], 1.0 +; SI-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc +; SI-NEXT: v_ldexp_f64 v[5:6], v[5:6], 26 +; SI-NEXT: s_sub_i32 s5, s5, 26 +; SI-NEXT: s_cmp_gt_i32 s5, 26 +; SI-NEXT: s_cbranch_scc1 .LBB15_5 +; SI-NEXT: ; %bb.6: ; %Flow50 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: .LBB15_7: ; %frem.loop_exit24 +; SI-NEXT: s_sub_i32 s2, s5, 25 +; SI-NEXT: v_ldexp_f64 v[4:5], v[5:6], s2 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s3, 0x432fffff +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[4:5]|, s[2:3] +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_mov_b32_e32 v6, 0x43300000 +; SI-NEXT: v_bfi_b32 v7, s2, v6, v5 +; SI-NEXT: v_mov_b32_e32 v6, 0 +; SI-NEXT: v_add_f64 v[8:9], v[4:5], v[6:7] +; SI-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] +; SI-NEXT: v_cndmask_b32_e32 v7, v7, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v6, v4, vcc +; SI-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; SI-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 +; SI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; SI-NEXT: v_bfi_b32 v5, s2, v5, v1 +; SI-NEXT: .LBB15_8: +; SI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, 1.0 +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: s_cbranch_vccz .LBB15_10 +; SI-NEXT: ; %bb.9: ; %frem.else +; SI-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; SI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, 1.0 +; SI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc +; SI-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz .LBB15_11 +; SI-NEXT: s_branch .LBB15_16 +; SI-NEXT: .LBB15_10: +; SI-NEXT: ; implicit-def: $vgpr6_vgpr7 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: .LBB15_11: ; %frem.compute +; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: v_and_b32_e32 v8, 0x7fffffff, v3 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, 0x7ff00000 +; SI-NEXT: v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[2:3] +; SI-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| +; SI-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v2, v6, vcc +; SI-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] +; SI-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s2, v8 +; SI-NEXT: s_cselect_b32 s3, s2, 0 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_add_i32 s5, s3, -1 +; SI-NEXT: v_ldexp_f64 v[7:8], v[6:7], 26 +; SI-NEXT: s_cmp_lt_i32 s5, 27 +; SI-NEXT: s_cbranch_scc1 .LBB15_15 +; SI-NEXT: ; %bb.12: ; %frem.loop_body.preheader +; SI-NEXT: s_add_i32 s5, s3, 25 +; SI-NEXT: v_mov_b32_e32 v11, 0x43300000 +; SI-NEXT: v_mov_b32_e32 v6, 0 +; SI-NEXT: s_mov_b32 s3, 0x432fffff +; SI-NEXT: .LBB15_13: ; %frem.loop_body +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v10, v8 +; SI-NEXT: v_mov_b32_e32 v9, v7 +; SI-NEXT: v_bfi_b32 v7, s4, v11, v10 +; SI-NEXT: v_add_f64 v[12:13], v[9:10], v[6:7] +; SI-NEXT: v_add_f64 v[7:8], v[12:13], -v[6:7] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[9:10]|, s[2:3] +; SI-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; SI-NEXT: v_add_f64 v[7:8], v[9:10], -v[7:8] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[7:8] +; SI-NEXT: v_add_f64 v[12:13], v[7:8], 1.0 +; SI-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc +; SI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc +; SI-NEXT: v_ldexp_f64 v[7:8], v[7:8], 26 +; SI-NEXT: s_sub_i32 s5, s5, 26 +; SI-NEXT: s_cmp_gt_i32 s5, 26 +; SI-NEXT: s_cbranch_scc1 .LBB15_13 +; SI-NEXT: ; %bb.14: ; %Flow +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: v_mov_b32_e32 v8, v10 +; SI-NEXT: .LBB15_15: ; %frem.loop_exit +; SI-NEXT: s_sub_i32 s2, s5, 25 +; SI-NEXT: v_ldexp_f64 v[6:7], v[7:8], s2 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s3, 0x432fffff +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[6:7]|, s[2:3] +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_mov_b32_e32 v8, 0x43300000 +; SI-NEXT: v_bfi_b32 v9, s2, v8, v7 +; SI-NEXT: v_mov_b32_e32 v8, 0 +; SI-NEXT: v_add_f64 v[10:11], v[6:7], v[8:9] +; SI-NEXT: v_add_f64 v[8:9], v[10:11], -v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v9, v9, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc +; SI-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; SI-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 +; SI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; SI-NEXT: v_bfi_b32 v7, s2, v7, v3 +; SI-NEXT: .LBB15_16: ; %Flow49 +; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_mov_b32 s5, 0x7ff00000 +; SI-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[4:5] +; SI-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; SI-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; CI-LABEL: frem_v2f64_const_one_denum: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 +; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0 +; CI-NEXT: s_and_b64 vcc, exec, s[2:3] +; CI-NEXT: s_cbranch_vccz .LBB15_2 +; CI-NEXT: ; %bb.1: ; %frem.else16 +; CI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0 +; CI-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; CI-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc +; CI-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc +; CI-NEXT: s_cbranch_execz .LBB15_3 +; CI-NEXT: s_branch .LBB15_8 +; CI-NEXT: .LBB15_2: +; CI-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CI-NEXT: .LBB15_3: ; %frem.compute15 +; CI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| +; CI-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] +; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; CI-NEXT: v_add_i32_e32 v8, vcc, -1, v6 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v8 +; CI-NEXT: s_cbranch_vccnz .LBB15_7 +; CI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader +; CI-NEXT: v_add_i32_e32 v8, vcc, 25, v6 +; CI-NEXT: .LBB15_5: ; %frem.loop_body23 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v7, v5 +; CI-NEXT: v_mov_b32_e32 v6, v4 +; CI-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] +; CI-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] +; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; CI-NEXT: v_add_f64 v[9:10], v[4:5], 1.0 +; CI-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc +; CI-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; CI-NEXT: v_subrev_i32_e32 v8, vcc, 26, v8 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v8 +; CI-NEXT: s_cbranch_vccnz .LBB15_5 +; CI-NEXT: ; %bb.6: ; %Flow50 +; CI-NEXT: v_mov_b32_e32 v4, v6 +; CI-NEXT: v_mov_b32_e32 v5, v7 +; CI-NEXT: .LBB15_7: ; %frem.loop_exit24 +; CI-NEXT: v_subrev_i32_e32 v6, vcc, 25, v8 +; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] +; CI-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] +; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; CI-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 +; CI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CI-NEXT: v_bfi_b32 v5, s2, v5, v1 +; CI-NEXT: .LBB15_8: +; CI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, 1.0 +; CI-NEXT: s_and_b64 vcc, exec, s[2:3] +; CI-NEXT: s_cbranch_vccz .LBB15_10 +; CI-NEXT: ; %bb.9: ; %frem.else +; CI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, 1.0 +; CI-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; CI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc +; CI-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc +; CI-NEXT: s_cbranch_execz .LBB15_11 +; CI-NEXT: s_branch .LBB15_16 +; CI-NEXT: .LBB15_10: +; CI-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CI-NEXT: .LBB15_11: ; %frem.compute +; CI-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| +; CI-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] +; CI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; CI-NEXT: v_add_i32_e32 v10, vcc, -1, v8 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v10 +; CI-NEXT: s_cbranch_vccnz .LBB15_15 +; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader +; CI-NEXT: v_add_i32_e32 v10, vcc, 25, v8 +; CI-NEXT: .LBB15_13: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v9, v7 +; CI-NEXT: v_mov_b32_e32 v8, v6 +; CI-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] +; CI-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] +; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; CI-NEXT: v_add_f64 v[11:12], v[6:7], 1.0 +; CI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc +; CI-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc +; CI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; CI-NEXT: v_subrev_i32_e32 v10, vcc, 26, v10 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v10 +; CI-NEXT: s_cbranch_vccnz .LBB15_13 +; CI-NEXT: ; %bb.14: ; %Flow +; CI-NEXT: v_mov_b32_e32 v6, v8 +; CI-NEXT: v_mov_b32_e32 v7, v9 +; CI-NEXT: .LBB15_15: ; %frem.loop_exit +; CI-NEXT: v_subrev_i32_e32 v8, vcc, 25, v10 +; CI-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] +; CI-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] +; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; CI-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 +; CI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; CI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; CI-NEXT: v_bfi_b32 v7, s2, v7, v3 +; CI-NEXT: .LBB15_16: ; %Flow49 +; CI-NEXT: s_mov_b32 s4, 0 +; CI-NEXT: s_mov_b32 s5, 0x7ff00000 +; CI-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[4:5] +; CI-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc +; CI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; CI-NEXT: v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[4:5] +; CI-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc +; CI-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: frem_v2f64_const_one_denum: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0 +; VI-NEXT: s_and_b64 vcc, exec, s[2:3] +; VI-NEXT: s_cbranch_vccz .LBB15_2 +; VI-NEXT: ; %bb.1: ; %frem.else16 +; VI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0 +; VI-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; VI-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc +; VI-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc +; VI-NEXT: s_cbranch_execz .LBB15_3 +; VI-NEXT: s_branch .LBB15_8 +; VI-NEXT: .LBB15_2: +; VI-NEXT: ; implicit-def: $vgpr4_vgpr5 +; VI-NEXT: .LBB15_3: ; %frem.compute15 +; VI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| +; VI-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] +; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; VI-NEXT: v_add_u32_e32 v8, vcc, -1, v6 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v8 +; VI-NEXT: s_cbranch_vccnz .LBB15_7 +; VI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader +; VI-NEXT: v_add_u32_e32 v8, vcc, 25, v6 +; VI-NEXT: .LBB15_5: ; %frem.loop_body23 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] +; VI-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; VI-NEXT: v_add_f64 v[9:10], v[4:5], 1.0 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; VI-NEXT: v_subrev_u32_e32 v8, vcc, 26, v8 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v8 +; VI-NEXT: s_cbranch_vccnz .LBB15_5 +; VI-NEXT: ; %bb.6: ; %Flow50 +; VI-NEXT: v_mov_b32_e32 v4, v6 +; VI-NEXT: v_mov_b32_e32 v5, v7 +; VI-NEXT: .LBB15_7: ; %frem.loop_exit24 +; VI-NEXT: v_subrev_u32_e32 v6, vcc, 25, v8 +; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] +; VI-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; VI-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; VI-NEXT: v_bfi_b32 v5, s2, v5, v1 +; VI-NEXT: .LBB15_8: +; VI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, 1.0 +; VI-NEXT: s_and_b64 vcc, exec, s[2:3] +; VI-NEXT: s_cbranch_vccz .LBB15_10 +; VI-NEXT: ; %bb.9: ; %frem.else +; VI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, 1.0 +; VI-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; VI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc +; VI-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc +; VI-NEXT: s_cbranch_execz .LBB15_11 +; VI-NEXT: s_branch .LBB15_16 +; VI-NEXT: .LBB15_10: +; VI-NEXT: ; implicit-def: $vgpr6_vgpr7 +; VI-NEXT: .LBB15_11: ; %frem.compute +; VI-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| +; VI-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] +; VI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; VI-NEXT: v_add_u32_e32 v10, vcc, -1, v8 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v10 +; VI-NEXT: s_cbranch_vccnz .LBB15_15 +; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader +; VI-NEXT: v_add_u32_e32 v10, vcc, 25, v8 +; VI-NEXT: .LBB15_13: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v9, v7 +; VI-NEXT: v_mov_b32_e32 v8, v6 +; VI-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] +; VI-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; VI-NEXT: v_add_f64 v[11:12], v[6:7], 1.0 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc +; VI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; VI-NEXT: v_subrev_u32_e32 v10, vcc, 26, v10 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v10 +; VI-NEXT: s_cbranch_vccnz .LBB15_13 +; VI-NEXT: ; %bb.14: ; %Flow +; VI-NEXT: v_mov_b32_e32 v6, v8 +; VI-NEXT: v_mov_b32_e32 v7, v9 +; VI-NEXT: .LBB15_15: ; %frem.loop_exit +; VI-NEXT: v_subrev_u32_e32 v8, vcc, 25, v10 +; VI-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] +; VI-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; VI-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; VI-NEXT: v_bfi_b32 v7, s2, v7, v3 +; VI-NEXT: .LBB15_16: ; %Flow49 +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_mov_b32 s3, 0x7ff00000 +; VI-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[2:3] +; VI-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; VI-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; VI-NEXT: v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc +; VI-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: frem_v2f64_const_one_denum: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0 +; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX9-NEXT: s_cbranch_vccz .LBB15_2 +; GFX9-NEXT: ; %bb.1: ; %frem.else16 +; GFX9-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc +; GFX9-NEXT: s_cbranch_execz .LBB15_3 +; GFX9-NEXT: s_branch .LBB15_8 +; GFX9-NEXT: .LBB15_2: +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX9-NEXT: .LBB15_3: ; %frem.compute15 +; GFX9-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| +; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] +; GFX9-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; GFX9-NEXT: v_add_u32_e32 v8, -1, v6 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 27, v8 +; GFX9-NEXT: s_cbranch_vccnz .LBB15_7 +; GFX9-NEXT: ; %bb.4: ; %frem.loop_body23.preheader +; GFX9-NEXT: v_add_u32_e32 v8, 25, v6 +; GFX9-NEXT: .LBB15_5: ; %frem.loop_body23 +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] +; GFX9-NEXT: v_subrev_u32_e32 v8, 26, v8 +; GFX9-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] +; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: v_add_f64 v[9:10], v[4:5], 1.0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; GFX9-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 26, v8 +; GFX9-NEXT: s_cbranch_vccnz .LBB15_5 +; GFX9-NEXT: ; %bb.6: ; %Flow50 +; GFX9-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-NEXT: .LBB15_7: ; %frem.loop_exit24 +; GFX9-NEXT: v_subrev_u32_e32 v6, 25, v8 +; GFX9-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] +; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_bfi_b32 v5, s2, v5, v1 +; GFX9-NEXT: .LBB15_8: +; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, 1.0 +; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX9-NEXT: s_cbranch_vccz .LBB15_10 +; GFX9-NEXT: ; %bb.9: ; %frem.else +; GFX9-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, 1.0 +; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc +; GFX9-NEXT: s_cbranch_execz .LBB15_11 +; GFX9-NEXT: s_branch .LBB15_16 +; GFX9-NEXT: .LBB15_10: +; GFX9-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX9-NEXT: .LBB15_11: ; %frem.compute +; GFX9-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| +; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] +; GFX9-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; GFX9-NEXT: v_add_u32_e32 v10, -1, v8 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 27, v10 +; GFX9-NEXT: s_cbranch_vccnz .LBB15_15 +; GFX9-NEXT: ; %bb.12: ; %frem.loop_body.preheader +; GFX9-NEXT: v_add_u32_e32 v10, 25, v8 +; GFX9-NEXT: .LBB15_13: ; %frem.loop_body +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] +; GFX9-NEXT: v_subrev_u32_e32 v10, 26, v10 +; GFX9-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] +; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_add_f64 v[11:12], v[6:7], 1.0 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc +; GFX9-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 26, v10 +; GFX9-NEXT: s_cbranch_vccnz .LBB15_13 +; GFX9-NEXT: ; %bb.14: ; %Flow +; GFX9-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-NEXT: v_mov_b32_e32 v7, v9 +; GFX9-NEXT: .LBB15_15: ; %frem.loop_exit +; GFX9-NEXT: v_subrev_u32_e32 v8, 25, v10 +; GFX9-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] +; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GFX9-NEXT: v_bfi_b32 v7, s2, v7, v3 +; GFX9-NEXT: .LBB15_16: ; %Flow49 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_mov_b32 s3, 0x7ff00000 +; GFX9-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; GFX9-NEXT: v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: frem_v2f64_const_one_denum: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, 1.0 +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_vccz .LBB15_2 +; GFX10-NEXT: ; %bb.1: ; %frem.else16 +; GFX10-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, 1.0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB15_3 +; GFX10-NEXT: s_branch .LBB15_8 +; GFX10-NEXT: .LBB15_2: +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: .LBB15_3: ; %frem.compute15 +; GFX10-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| +; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] +; GFX10-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; GFX10-NEXT: v_add_nc_u32_e32 v8, -1, v6 +; GFX10-NEXT: v_readfirstlane_b32 s2, v6 +; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v8 +; GFX10-NEXT: s_cbranch_vccnz .LBB15_7 +; GFX10-NEXT: ; %bb.4: ; %frem.loop_body23.preheader +; GFX10-NEXT: s_add_i32 s2, s2, 25 +; GFX10-NEXT: .LBB15_5: ; %frem.loop_body23 +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: s_sub_i32 s2, s2, 26 +; GFX10-NEXT: s_cmp_gt_i32 s2, 26 +; GFX10-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] +; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] +; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_add_f64 v[8:9], v[4:5], 1.0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX10-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; GFX10-NEXT: s_cbranch_scc1 .LBB15_5 +; GFX10-NEXT: ; %bb.6: ; %Flow50 +; GFX10-NEXT: v_mov_b32_e32 v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v8, s2 +; GFX10-NEXT: v_mov_b32_e32 v5, v7 +; GFX10-NEXT: .LBB15_7: ; %frem.loop_exit24 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 25, v8 +; GFX10-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; GFX10-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] +; GFX10-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] +; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX10-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1 +; GFX10-NEXT: .LBB15_8: +; GFX10-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, 1.0 +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_vccz .LBB15_10 +; GFX10-NEXT: ; %bb.9: ; %frem.else +; GFX10-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, 1.0 +; GFX10-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB15_11 +; GFX10-NEXT: s_branch .LBB15_16 +; GFX10-NEXT: .LBB15_10: +; GFX10-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX10-NEXT: .LBB15_11: ; %frem.compute +; GFX10-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| +; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] +; GFX10-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; GFX10-NEXT: v_add_nc_u32_e32 v10, -1, v8 +; GFX10-NEXT: v_readfirstlane_b32 s2, v8 +; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v10 +; GFX10-NEXT: s_cbranch_vccnz .LBB15_15 +; GFX10-NEXT: ; %bb.12: ; %frem.loop_body.preheader +; GFX10-NEXT: s_add_i32 s2, s2, 25 +; GFX10-NEXT: .LBB15_13: ; %frem.loop_body +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v9, v7 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: s_sub_i32 s2, s2, 26 +; GFX10-NEXT: s_cmp_gt_i32 s2, 26 +; GFX10-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] +; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] +; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] +; GFX10-NEXT: v_add_f64 v[10:11], v[6:7], 1.0 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo +; GFX10-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; GFX10-NEXT: s_cbranch_scc1 .LBB15_13 +; GFX10-NEXT: ; %bb.14: ; %Flow +; GFX10-NEXT: v_mov_b32_e32 v6, v8 +; GFX10-NEXT: v_mov_b32_e32 v10, s2 +; GFX10-NEXT: v_mov_b32_e32 v7, v9 +; GFX10-NEXT: .LBB15_15: ; %frem.loop_exit +; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 25, v10 +; GFX10-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; GFX10-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] +; GFX10-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] +; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] +; GFX10-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo +; GFX10-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v3 +; GFX10-NEXT: .LBB15_16: ; %Flow49 +; GFX10-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo +; GFX10-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[2:3]| +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7ff80000, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc_lo +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: frem_v2f64_const_one_denum: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, 1.0 +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccz .LBB15_2 +; GFX11-NEXT: ; %bb.1: ; %frem.else16 +; GFX11-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, 1.0 +; GFX11-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo +; GFX11-NEXT: s_cbranch_execz .LBB15_3 +; GFX11-NEXT: s_branch .LBB15_8 +; GFX11-NEXT: .LBB15_2: +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: .LBB15_3: ; %frem.compute15 +; GFX11-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| +; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; GFX11-NEXT: v_add_nc_u32_e32 v8, -1, v6 +; GFX11-NEXT: v_readfirstlane_b32 s2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v8 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_7 +; GFX11-NEXT: ; %bb.4: ; %frem.loop_body23.preheader +; GFX11-NEXT: s_add_i32 s2, s2, 25 +; GFX11-NEXT: .LBB15_5: ; %frem.loop_body23 +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_sub_i32 s2, s2, 26 +; GFX11-NEXT: s_cmp_gt_i32 s2, 26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] +; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_add_f64 v[8:9], v[4:5], 1.0 +; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v9 :: v_dual_cndmask_b32 v4, v4, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; GFX11-NEXT: s_cbranch_scc1 .LBB15_5 +; GFX11-NEXT: ; %bb.6: ; %Flow50 +; GFX11-NEXT: v_mov_b32_e32 v4, v6 +; GFX11-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v5, v7 +; GFX11-NEXT: .LBB15_7: ; %frem.loop_exit24 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_subrev_nc_u32_e32 v6, 25, v8 +; GFX11-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 +; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v5, v5, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1 +; GFX11-NEXT: .LBB15_8: +; GFX11-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, 1.0 +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccz .LBB15_10 +; GFX11-NEXT: ; %bb.9: ; %frem.else +; GFX11-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, 1.0 +; GFX11-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc_lo +; GFX11-NEXT: s_cbranch_execz .LBB15_11 +; GFX11-NEXT: s_branch .LBB15_16 +; GFX11-NEXT: .LBB15_10: +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: .LBB15_11: ; %frem.compute +; GFX11-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| +; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; GFX11-NEXT: v_add_nc_u32_e32 v10, -1, v8 +; GFX11-NEXT: v_readfirstlane_b32 s2, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v10 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_15 +; GFX11-NEXT: ; %bb.12: ; %frem.loop_body.preheader +; GFX11-NEXT: s_add_i32 s2, s2, 25 +; GFX11-NEXT: .LBB15_13: ; %frem.loop_body +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX11-NEXT: s_sub_i32 s2, s2, 26 +; GFX11-NEXT: s_cmp_gt_i32 s2, 26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] +; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] +; GFX11-NEXT: v_add_f64 v[10:11], v[6:7], 1.0 +; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v11 :: v_dual_cndmask_b32 v6, v6, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; GFX11-NEXT: s_cbranch_scc1 .LBB15_13 +; GFX11-NEXT: ; %bb.14: ; %Flow +; GFX11-NEXT: v_mov_b32_e32 v6, v8 +; GFX11-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v7, v9 +; GFX11-NEXT: .LBB15_15: ; %frem.loop_exit +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 25, v10 +; GFX11-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] +; GFX11-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 +; GFX11-NEXT: v_dual_cndmask_b32 v6, v6, v8 :: v_dual_cndmask_b32 v7, v7, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v3 +; GFX11-NEXT: .LBB15_16: ; %Flow49 +; GFX11-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo +; GFX11-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[2:3]| +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v3, 0x7ff80000, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc_lo +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX1150-LABEL: frem_v2f64_const_one_denum: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-NEXT: v_mov_b32_e32 v0, 0 +; GFX1150-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-NEXT: global_load_b128 v[0:3], v0, s[2:3] +; GFX1150-NEXT: s_waitcnt vmcnt(0) +; GFX1150-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, 1.0 +; GFX1150-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX1150-NEXT: s_cbranch_vccz .LBB15_2 +; GFX1150-NEXT: ; %bb.1: ; %frem.else16 +; GFX1150-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, 1.0 +; GFX1150-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo +; GFX1150-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo +; GFX1150-NEXT: s_cbranch_execz .LBB15_3 +; GFX1150-NEXT: s_branch .LBB15_8 +; GFX1150-NEXT: .LBB15_2: +; GFX1150-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1150-NEXT: .LBB15_3: ; %frem.compute15 +; GFX1150-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| +; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1150-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; GFX1150-NEXT: v_add_nc_u32_e32 v8, -1, v6 +; GFX1150-NEXT: v_readfirstlane_b32 s2, v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v8 +; GFX1150-NEXT: s_cbranch_vccnz .LBB15_7 +; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body23.preheader +; GFX1150-NEXT: s_add_i32 s2, s2, 25 +; GFX1150-NEXT: .LBB15_5: ; %frem.loop_body23 +; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX1150-NEXT: s_sub_i32 s2, s2, 26 +; GFX1150-NEXT: s_cmp_gt_i32 s2, 26 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] +; GFX1150-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] +; GFX1150-NEXT: v_add_f64 v[8:9], v[4:5], 1.0 +; GFX1150-NEXT: v_dual_cndmask_b32 v5, v5, v9 :: v_dual_cndmask_b32 v4, v4, v8 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; GFX1150-NEXT: s_cbranch_scc1 .LBB15_5 +; GFX1150-NEXT: ; %bb.6: ; %Flow50 +; GFX1150-NEXT: v_mov_b32_e32 v4, v6 +; GFX1150-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v5, v7 +; GFX1150-NEXT: .LBB15_7: ; %frem.loop_exit24 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_subrev_nc_u32_e32 v6, 25, v8 +; GFX1150-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] +; GFX1150-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] +; GFX1150-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 +; GFX1150-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v5, v5, v7 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1 +; GFX1150-NEXT: .LBB15_8: +; GFX1150-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, 1.0 +; GFX1150-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX1150-NEXT: s_cbranch_vccz .LBB15_10 +; GFX1150-NEXT: ; %bb.9: ; %frem.else +; GFX1150-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, 1.0 +; GFX1150-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc_lo +; GFX1150-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc_lo +; GFX1150-NEXT: s_cbranch_execz .LBB15_11 +; GFX1150-NEXT: s_branch .LBB15_16 +; GFX1150-NEXT: .LBB15_10: +; GFX1150-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1150-NEXT: .LBB15_11: ; %frem.compute +; GFX1150-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| +; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1150-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; GFX1150-NEXT: v_add_nc_u32_e32 v10, -1, v8 +; GFX1150-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v10 +; GFX1150-NEXT: s_cbranch_vccnz .LBB15_15 +; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body.preheader +; GFX1150-NEXT: s_add_i32 s2, s2, 25 +; GFX1150-NEXT: .LBB15_13: ; %frem.loop_body +; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX1150-NEXT: s_sub_i32 s2, s2, 26 +; GFX1150-NEXT: s_cmp_gt_i32 s2, 26 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] +; GFX1150-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] +; GFX1150-NEXT: v_add_f64 v[10:11], v[6:7], 1.0 +; GFX1150-NEXT: v_dual_cndmask_b32 v7, v7, v11 :: v_dual_cndmask_b32 v6, v6, v10 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; GFX1150-NEXT: s_cbranch_scc1 .LBB15_13 +; GFX1150-NEXT: ; %bb.14: ; %Flow +; GFX1150-NEXT: v_mov_b32_e32 v6, v8 +; GFX1150-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v7, v9 +; GFX1150-NEXT: .LBB15_15: ; %frem.loop_exit +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_subrev_nc_u32_e32 v8, 25, v10 +; GFX1150-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] +; GFX1150-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] +; GFX1150-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 +; GFX1150-NEXT: v_dual_cndmask_b32 v6, v6, v8 :: v_dual_cndmask_b32 v7, v7, v9 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v3 +; GFX1150-NEXT: .LBB15_16: ; %Flow49 +; GFX1150-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| +; GFX1150-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo +; GFX1150-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo +; GFX1150-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[2:3]| +; GFX1150-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v3, 0x7ff80000, v7 +; GFX1150-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc_lo +; GFX1150-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX1150-NEXT: s_endpgm +; +; GFX1200-LABEL: frem_v2f64_const_one_denum: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1200-NEXT: v_mov_b32_e32 v0, 0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: global_load_b128 v[0:3], v0, s[2:3] +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, 1.0 +; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX1200-NEXT: s_cbranch_vccz .LBB15_2 +; GFX1200-NEXT: ; %bb.1: ; %frem.else16 +; GFX1200-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, 1.0 +; GFX1200-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo +; GFX1200-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo +; GFX1200-NEXT: s_cbranch_execz .LBB15_3 +; GFX1200-NEXT: s_branch .LBB15_8 +; GFX1200-NEXT: .LBB15_2: +; GFX1200-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1200-NEXT: .LBB15_3: ; %frem.compute15 +; GFX1200-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| +; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; GFX1200-NEXT: v_add_nc_u32_e32 v8, -1, v6 +; GFX1200-NEXT: v_readfirstlane_b32 s2, v6 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v8 +; GFX1200-NEXT: s_cbranch_vccnz .LBB15_7 +; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body23.preheader +; GFX1200-NEXT: s_add_co_i32 s2, s2, 25 +; GFX1200-NEXT: .LBB15_5: ; %frem.loop_body23 +; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1200-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX1200-NEXT: s_sub_co_i32 s2, s2, 26 +; GFX1200-NEXT: s_cmp_gt_i32 s2, 26 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] +; GFX1200-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[4:5] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] +; GFX1200-NEXT: v_add_f64_e32 v[8:9], 1.0, v[4:5] +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: v_dual_cndmask_b32 v5, v5, v9 :: v_dual_cndmask_b32 v4, v4, v8 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; GFX1200-NEXT: s_cbranch_scc1 .LBB15_5 +; GFX1200-NEXT: ; %bb.6: ; %Flow50 +; GFX1200-NEXT: v_mov_b32_e32 v4, v6 +; GFX1200-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v5, v7 +; GFX1200-NEXT: .LBB15_7: ; %frem.loop_exit24 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_subrev_nc_u32_e32 v6, 25, v8 +; GFX1200-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] +; GFX1200-NEXT: v_add_f64_e64 v[4:5], v[4:5], -v[6:7] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] +; GFX1200-NEXT: v_add_f64_e32 v[6:7], 1.0, v[4:5] +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v5, v5, v7 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1 +; GFX1200-NEXT: .LBB15_8: +; GFX1200-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, 1.0 +; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cbranch_vccz .LBB15_10 +; GFX1200-NEXT: ; %bb.9: ; %frem.else +; GFX1200-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, 1.0 +; GFX1200-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc_lo +; GFX1200-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc_lo +; GFX1200-NEXT: s_cbranch_execz .LBB15_11 +; GFX1200-NEXT: s_branch .LBB15_16 +; GFX1200-NEXT: .LBB15_10: +; GFX1200-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1200-NEXT: .LBB15_11: ; %frem.compute +; GFX1200-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| +; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; GFX1200-NEXT: v_add_nc_u32_e32 v10, -1, v8 +; GFX1200-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v10 +; GFX1200-NEXT: s_cbranch_vccnz .LBB15_15 +; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body.preheader +; GFX1200-NEXT: s_add_co_i32 s2, s2, 25 +; GFX1200-NEXT: .LBB15_13: ; %frem.loop_body +; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_sub_co_i32 s2, s2, 26 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cmp_gt_i32 s2, 26 +; GFX1200-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_add_f64_e64 v[6:7], v[8:9], -v[6:7] +; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] +; GFX1200-NEXT: v_add_f64_e32 v[10:11], 1.0, v[6:7] +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_dual_cndmask_b32 v7, v7, v11 :: v_dual_cndmask_b32 v6, v6, v10 +; GFX1200-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; GFX1200-NEXT: s_cbranch_scc1 .LBB15_13 +; GFX1200-NEXT: ; %bb.14: ; %Flow +; GFX1200-NEXT: v_mov_b32_e32 v6, v8 +; GFX1200-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v7, v9 +; GFX1200-NEXT: .LBB15_15: ; %frem.loop_exit +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_subrev_nc_u32_e32 v8, 25, v10 +; GFX1200-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] +; GFX1200-NEXT: v_add_f64_e64 v[6:7], v[6:7], -v[8:9] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] +; GFX1200-NEXT: v_add_f64_e32 v[8:9], 1.0, v[6:7] +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: v_dual_cndmask_b32 v6, v6, v8 :: v_dual_cndmask_b32 v7, v7, v9 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v3 +; GFX1200-NEXT: .LBB15_16: ; %Flow49 +; GFX1200-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo +; GFX1200-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo +; GFX1200-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[2:3]| +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v3, 0x7ff80000, v7 +; GFX1200-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc_lo +; GFX1200-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX1200-NEXT: s_endpgm + %r0 = load <2 x double>, ptr addrspace(1) %in, align 16 + %r1 = frem <2 x double> %r0, <double 1.0, double 1.0> + store <2 x double> %r1, ptr addrspace(1) %out, align 16 + ret void +} + +define amdgpu_kernel void @frem_v2f64_const(ptr addrspace(1) %out) #0 { +; SI-LABEL: frem_v2f64_const: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: v_mov_b32_e32 v1, 0x3ff00000 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; CI-LABEL: frem_v2f64_const: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_mov_b32_e32 v1, 0x3ff00000 +; CI-NEXT: v_mov_b32_e32 v2, v0 +; CI-NEXT: v_mov_b32_e32 v3, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: frem_v2f64_const: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: v_mov_b32_e32 v1, 0x3ff00000 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: frem_v2f64_const: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff00000 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: frem_v2f64_const: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x3ff00000 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: frem_v2f64_const: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b128 v0, v[0:3], s[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX1150-LABEL: frem_v2f64_const: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1150-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_mov_b32_e32 v2, v0 +; GFX1150-NEXT: v_mov_b32_e32 v3, v0 +; GFX1150-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-NEXT: global_store_b128 v0, v[0:3], s[0:1] +; GFX1150-NEXT: s_endpgm +; +; GFX1200-LABEL: frem_v2f64_const: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1200-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: global_store_b128 v0, v[0:3], s[0:1] +; GFX1200-NEXT: s_endpgm + %r0 = frem <2 x double> <double 1.0, double 1.0>, <double 2.0, double 1.0> + store <2 x double> %r0, ptr addrspace(1) %out, align 16 + ret void +} + + + attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } + + diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll index b91963f..d23509b 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll @@ -150,7 +150,6 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc, ; GFX10CU-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds ; GFX10CU-NEXT: v_mov_b32_e32 v0, s13 ; GFX10CU-NEXT: s_waitcnt vmcnt(0) -; GFX10CU-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10CU-NEXT: s_barrier ; GFX10CU-NEXT: ds_read_b32 v0, v0 ; GFX10CU-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll index 5d03dfb..352af04 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll @@ -218,3 +218,174 @@ main_body: ret void } +define amdgpu_ps void @global_load_lds_dword_volatile(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: global_load_lds_dword_volatile: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b32 m0, s0 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_load_dword v[0:1], off glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_load_dword v[0:1], off offset:256 lds +; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: global_load_lds_dword_volatile: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_mov_b32 m0, s0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:256 +; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-NEXT: s_endpgm +; +; GFX10-LABEL: global_load_lds_dword_volatile: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 m0, s0 +; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v[0:1], off offset:256 lds +; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX10-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: global_load_lds_dword_volatile: +; GFX942-GISEL: ; %bb.0: ; %main_body +; GFX942-GISEL-NEXT: s_mov_b32 m0, s0 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:256 +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-GISEL-NEXT: s_endpgm +main_body: + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648) + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 256, i32 0) + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @buffer_load_lds_dword_volatile(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: buffer_load_lds_dword_volatile: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX90A-NEXT: s_mov_b32 m0, s5 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: buffer_load_lds_dword_volatile: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-NEXT: s_mov_b32 m0, s5 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-NEXT: s_endpgm +; +; GFX10-LABEL: buffer_load_lds_dword_volatile: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: s_mov_b32 m0, s5 +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX10-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: buffer_load_lds_dword_volatile: +; GFX942-GISEL: ; %bb.0: ; %main_body +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-GISEL-NEXT: s_endpgm +main_body: + %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648) + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 256, i32 0) + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @global_load_lds_dword_nontemporal(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: global_load_lds_dword_nontemporal: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_mov_b32 m0, s2 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_load_dword v0, s[0:1] glc slc lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: global_load_lds_dword_nontemporal: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_mov_b32 m0, s2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_load_lds_dword v0, s[0:1] nt +; GFX942-NEXT: s_endpgm +; +; GFX10-LABEL: global_load_lds_dword_nontemporal: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: global_load_dword v0, s[0:1] slc lds +; GFX10-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: global_load_lds_dword_nontemporal: +; GFX942-GISEL: ; %bb.0: ; %main_body +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: global_load_lds_dword v0, s[0:1] nt +; GFX942-GISEL-NEXT: s_endpgm +main_body: + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0 + ret void +} + +define amdgpu_ps void @buffer_load_lds_dword_nontemporal(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: buffer_load_lds_dword_nontemporal: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX90A-NEXT: s_mov_b32 m0, s5 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc slc lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: buffer_load_lds_dword_nontemporal: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-NEXT: s_mov_b32 m0, s5 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds +; GFX942-NEXT: s_endpgm +; +; GFX10-LABEL: buffer_load_lds_dword_nontemporal: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: s_mov_b32 m0, s5 +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen slc lds +; GFX10-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: buffer_load_lds_dword_nontemporal: +; GFX942-GISEL: ; %bb.0: ; %main_body +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds +; GFX942-GISEL-NEXT: s_endpgm +main_body: + %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0 + ret void +} + +!0 = !{i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll index f0204bd..1dcd032 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll @@ -110,3 +110,43 @@ main_body: call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 1, i32 0, i32 0, i32 2048, i32 0) ret void } + +define amdgpu_ps float @buffer_load_lds_dword_volatile(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) { +; GCN-LABEL: buffer_load_lds_dword_volatile: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 m0, s4 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword off, s[0:3], 0 glc lds +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword off, s[0:3], 0 offset:256 lds +; GCN-NEXT: buffer_load_dword off, s[0:3], 0 offset:512 lds +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ; return to shader part epilog +main_body: + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 2147483648) + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 256, i32 0) + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 512, i32 0) + %res = load float, ptr addrspace(3) %lds + ret float %res +} + +define amdgpu_ps float @buffer_load_lds_dword_nontemporal(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) { +; GCN-LABEL: buffer_load_lds_dword_nontemporal: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 m0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_load_dword off, s[0:3], 0 glc slc lds +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ; return to shader part epilog +main_body: + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0), !nontemporal !0 + %res = load float, ptr addrspace(3) %lds + ret float %res +} + +!0 = !{i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll index b5d741b..a979999 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll @@ -80,25 +80,29 @@ define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_volatile(p ; PREGFX10-LABEL: buffer_load_volatile: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc +; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: buffer_load_volatile: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_clause 0x2 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: buffer_load_volatile: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[0:3], 0 glc slc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll index e8b8d05..e8eccb0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll @@ -147,14 +147,13 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr { ; GFX8-OPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-OPT-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-OPT-NEXT: s_barrier -; GFX8-OPT-NEXT: v_add_u32_e32 v1, vcc, v1, v1 -; GFX8-OPT-NEXT: s_nop 1 -; GFX8-OPT-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf -; GFX8-OPT-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-OPT-NEXT: v_add_u32_e32 v4, vcc, v1, v1 +; GFX8-OPT-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-OPT-NEXT: v_mov_b32_dpp v2, v4 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf +; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-OPT-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-OPT-NEXT: s_barrier ; GFX8-OPT-NEXT: flat_store_dword v[0:1], v2 ; GFX8-OPT-NEXT: s_endpgm ; @@ -194,14 +193,14 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr { ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: ds_read_b32 v1, v0 -; GFX10-NEXT: s_barrier -; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v1 -; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf -; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v1, v1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf +; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-NEXT: s_barrier +; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: flat_store_dword v[0:1], v2 ; GFX10-NEXT: s_endpgm ; @@ -213,15 +212,15 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX11-NEXT: ds_load_b32 v1, v0 -; GFX11-NEXT: s_barrier -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf -; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v1, v1 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX11-NEXT: s_barrier +; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll index 97db15b..1d1d3e4 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll @@ -107,6 +107,7 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 @@ -120,7 +121,6 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i ; GFX12-NEXT: s_mov_b32 s3, s12 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen ; GFX12-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll index 9dac239..1e4b633 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll @@ -354,6 +354,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX9-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11] ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 ; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 ; GFX9-SDAG-NEXT: s_mov_b32 s5, s10 @@ -365,8 +366,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX9-SDAG-NEXT: s_mov_b32 s3, s10 ; GFX9-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11] ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -384,6 +385,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 ; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30 ; GFX9-GISEL-NEXT: s_mov_b32 s4, s11 @@ -395,8 +397,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX9-GISEL-NEXT: s_mov_b32 s10, s3 ; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX942-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -414,6 +416,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX942-SDAG-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-SDAG-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX942-SDAG-NEXT: s_load_dword s13, s[4:5], 0x30 ; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX942-SDAG-NEXT: s_mov_b32 s5, s12 @@ -425,8 +428,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX942-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX942-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX942-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -444,6 +447,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX942-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX942-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30 ; GFX942-GISEL-NEXT: s_mov_b32 s4, s7 @@ -455,8 +459,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX942-GISEL-NEXT: s_mov_b32 s6, s3 ; GFX942-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX942-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -475,6 +479,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX10-SDAG-NEXT: s_mov_b32 s11, s2 ; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11] ; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: s_clause 0x1 ; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 ; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 @@ -488,8 +493,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX10-SDAG-NEXT: s_mov_b32 s2, s1 ; GFX10-SDAG-NEXT: s_mov_b32 s3, s10 ; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -508,6 +513,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX10-GISEL-NEXT: s_mov_b32 s6, s3 ; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[6:7], s[4:5] ; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: s_clause 0x1 ; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 @@ -519,8 +525,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX10-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; GFX10-GISEL-NEXT: s_mov_b32 s6, s3 ; GFX10-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -541,6 +547,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 @@ -554,8 +561,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -576,6 +583,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] ; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 @@ -588,8 +596,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX11-GISEL-NEXT: s_mov_b32 s8, s3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -610,6 +618,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 @@ -623,8 +632,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -645,6 +654,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] ; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 @@ -657,8 +667,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-GISEL-NEXT: s_mov_b32 s8, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm entry: %val = load volatile i32, ptr addrspace(7) %in, !nontemporal !0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll index 516c3946..282a7ae 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll @@ -15,7 +15,6 @@ define amdgpu_kernel void @test_s_barrier() { ; ; GFX10-CU-LABEL: test_s_barrier: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-CU-NEXT: s_barrier ; GFX10-CU-NEXT: s_endpgm ; @@ -26,7 +25,6 @@ define amdgpu_kernel void @test_s_barrier() { ; ; GFX11-CU-LABEL: test_s_barrier: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3 ; GFX11-CU-NEXT: s_barrier ; GFX11-CU-NEXT: s_endpgm ; @@ -38,7 +36,6 @@ define amdgpu_kernel void @test_s_barrier() { ; ; GFX12-CU-LABEL: test_s_barrier: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_alu 0xffe3 ; GFX12-CU-NEXT: s_barrier_signal -1 ; GFX12-CU-NEXT: s_barrier_wait -1 ; GFX12-CU-NEXT: s_endpgm @@ -63,8 +60,8 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() { ; ; GFX10-CU-LABEL: test_s_barrier_workgroup_fence: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_barrier ; GFX10-CU-NEXT: s_endpgm ; @@ -77,8 +74,8 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() { ; ; GFX11-CU-LABEL: test_s_barrier_workgroup_fence: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_barrier ; GFX11-CU-NEXT: s_endpgm ; @@ -94,8 +91,10 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() { ; ; GFX12-CU-LABEL: test_s_barrier_workgroup_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 -; GFX12-CU-NEXT: s_wait_alu 0xffe3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_barrier_signal -1 ; GFX12-CU-NEXT: s_barrier_wait -1 ; GFX12-CU-NEXT: s_endpgm @@ -125,7 +124,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() { ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-CU-NEXT: s_barrier ; GFX10-CU-NEXT: s_endpgm ; @@ -140,7 +138,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() { ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3 ; GFX11-CU-NEXT: s_barrier ; GFX11-CU-NEXT: s_endpgm ; @@ -160,7 +157,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() { ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: s_wait_alu 0xffe3 ; GFX12-CU-NEXT: s_barrier_signal -1 ; GFX12-CU-NEXT: s_barrier_wait -1 ; GFX12-CU-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll index 6a76f43..7efbff9 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll @@ -107,6 +107,8 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX10-CU-LABEL: workgroup_release_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_release_fence: @@ -139,6 +141,8 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX11-CU-LABEL: workgroup_release_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_release_fence: @@ -151,6 +155,10 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX12-CU-LABEL: workgroup_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_release_fence: @@ -181,6 +189,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX10-CU-LABEL: workgroup_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_acq_rel_fence: @@ -216,6 +226,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX11-CU-LABEL: workgroup_acq_rel_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_acq_rel_fence: @@ -229,6 +241,10 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX12-CU-LABEL: workgroup_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_acq_rel_fence: @@ -259,6 +275,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX10-CU-LABEL: workgroup_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_seq_cst_fence: @@ -294,6 +312,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX11-CU-LABEL: workgroup_seq_cst_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_seq_cst_fence: @@ -307,6 +327,10 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX12-CU-LABEL: workgroup_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_seq_cst_fence: @@ -412,6 +436,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_release_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence: @@ -444,6 +470,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_release_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_release_fence: @@ -456,6 +484,10 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX12-CU-LABEL: workgroup_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_one_as_release_fence: @@ -486,6 +518,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence: @@ -521,6 +555,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence: @@ -534,6 +570,10 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_one_as_acq_rel_fence: @@ -564,6 +604,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence: @@ -599,6 +641,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_seq_cst_fence: @@ -612,6 +656,10 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX12-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_one_as_seq_cst_fence: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll index d288bfc..1cca64a 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll @@ -1093,7 +1093,8 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX10-CU-LABEL: workgroup_release_fence: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_release_fence: @@ -1129,7 +1130,8 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX11-CU-LABEL: workgroup_release_fence: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_release_fence: @@ -1142,7 +1144,10 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX12-CU-LABEL: workgroup_release_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_release_fence: @@ -1175,7 +1180,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX10-CU-LABEL: workgroup_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_acq_rel_fence: @@ -1214,7 +1220,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX11-CU-LABEL: workgroup_acq_rel_fence: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_acq_rel_fence: @@ -1228,7 +1235,10 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX12-CU-LABEL: workgroup_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_acq_rel_fence: @@ -1261,7 +1271,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX10-CU-LABEL: workgroup_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_seq_cst_fence: @@ -1300,7 +1311,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX11-CU-LABEL: workgroup_seq_cst_fence: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_seq_cst_fence: @@ -1314,7 +1326,10 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX12-CU-LABEL: workgroup_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_seq_cst_fence: @@ -1420,6 +1435,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_release_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence: @@ -1452,6 +1469,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_release_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_release_fence: @@ -1464,6 +1483,10 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX12-CU-LABEL: workgroup_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_one_as_release_fence: @@ -1494,6 +1517,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence: @@ -1529,6 +1554,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence: @@ -1542,6 +1569,10 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_one_as_acq_rel_fence: @@ -1572,6 +1603,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence: @@ -1607,6 +1640,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_seq_cst_fence: @@ -1620,6 +1655,10 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX12-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_one_as_seq_cst_fence: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index 4caaad6..9e2906c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -795,8 +795,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -816,8 +814,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2942,8 +2938,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2964,8 +2958,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3196,8 +3188,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3218,8 +3208,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9001,8 +8989,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -9027,8 +9013,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9349,8 +9333,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -9375,8 +9357,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9677,8 +9657,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -9699,8 +9677,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -10335,8 +10311,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -10361,8 +10335,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -10683,8 +10655,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -10709,8 +10679,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -11031,8 +10999,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11057,8 +11023,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -11379,8 +11343,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11405,8 +11367,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12071,8 +12031,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12097,8 +12055,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12419,8 +12375,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12445,8 +12399,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12767,8 +12719,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12793,8 +12743,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -13632,8 +13580,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -13654,8 +13600,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -15789,8 +15733,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -15812,8 +15754,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -16054,8 +15994,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -16077,8 +16015,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -21829,8 +21765,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -21856,8 +21790,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -22188,8 +22120,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -22215,8 +22145,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -22527,8 +22455,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -22550,8 +22476,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -23207,8 +23131,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -23234,8 +23156,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -23566,8 +23486,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -23593,8 +23511,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -23925,8 +23841,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -23952,8 +23866,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -24284,8 +24196,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -24311,8 +24221,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -24998,8 +24906,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -25025,8 +24931,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -25357,8 +25261,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -25384,8 +25286,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -25716,8 +25616,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -25743,8 +25641,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll index 9ea9f11..27283be 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll @@ -795,8 +795,6 @@ define amdgpu_kernel void @flat_cluster_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -816,8 +814,6 @@ define amdgpu_kernel void @flat_cluster_seq_cst_load( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2937,8 +2933,6 @@ define amdgpu_kernel void @flat_cluster_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2959,8 +2953,6 @@ define amdgpu_kernel void @flat_cluster_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3190,8 +3182,6 @@ define amdgpu_kernel void @flat_cluster_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3212,8 +3202,6 @@ define amdgpu_kernel void @flat_cluster_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8982,8 +8970,6 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -9008,8 +8994,6 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9329,8 +9313,6 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -9355,8 +9337,6 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9656,8 +9636,6 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -9678,8 +9656,6 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -10314,8 +10290,6 @@ define amdgpu_kernel void @flat_cluster_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -10340,8 +10314,6 @@ define amdgpu_kernel void @flat_cluster_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -10661,8 +10633,6 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -10687,8 +10657,6 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -11008,8 +10976,6 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11034,8 +11000,6 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -11355,8 +11319,6 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11381,8 +11343,6 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12045,8 +12005,6 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12071,8 +12029,6 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12392,8 +12348,6 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12418,8 +12372,6 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12739,8 +12691,6 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12765,8 +12715,6 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -13603,8 +13551,6 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -13625,8 +13571,6 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -15755,8 +15699,6 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -15778,8 +15720,6 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -16019,8 +15959,6 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -16042,8 +15980,6 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -21781,8 +21717,6 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -21808,8 +21742,6 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -22139,8 +22071,6 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -22166,8 +22096,6 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -22477,8 +22405,6 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -22500,8 +22426,6 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -23157,8 +23081,6 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -23184,8 +23106,6 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -23515,8 +23435,6 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -23542,8 +23460,6 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -23873,8 +23789,6 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -23900,8 +23814,6 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -24231,8 +24143,6 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -24258,8 +24168,6 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -24943,8 +24851,6 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -24970,8 +24876,6 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -25301,8 +25205,6 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -25328,8 +25230,6 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -25659,8 +25559,6 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -25686,8 +25584,6 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index 265c8c4..74065146 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -799,8 +799,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -820,8 +818,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2979,8 +2975,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3002,8 +2996,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3239,8 +3231,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3262,8 +3252,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9126,8 +9114,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -9153,8 +9139,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9480,8 +9464,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -9507,8 +9489,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9811,8 +9791,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -9833,8 +9811,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -10476,8 +10452,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -10503,8 +10477,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -10830,8 +10802,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -10857,8 +10827,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -11184,8 +11152,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11211,8 +11177,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -11538,8 +11502,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11565,8 +11527,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12242,8 +12202,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12269,8 +12227,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12596,8 +12552,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12623,8 +12577,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12950,8 +12902,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12977,8 +12927,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -13820,8 +13768,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -13842,8 +13788,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -16010,8 +15954,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -16034,8 +15976,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -16281,8 +16221,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -16305,8 +16243,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -22138,8 +22074,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -22166,8 +22100,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -22503,8 +22435,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -22531,8 +22461,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -22845,8 +22773,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -22868,8 +22794,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -23532,8 +23456,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -23560,8 +23482,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -23897,8 +23817,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -23925,8 +23843,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -24262,8 +24178,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -24290,8 +24204,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -24627,8 +24539,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -24655,8 +24565,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -25353,8 +25261,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -25381,8 +25287,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -25718,8 +25622,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -25746,8 +25648,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -26083,8 +25983,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -26111,8 +26009,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index d277441..2afa577 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -1072,7 +1072,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -1109,7 +1110,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -1136,7 +1138,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index e7f348e..d384aec 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -656,12 +656,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -765,12 +765,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -786,8 +786,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -802,12 +800,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -1195,7 +1195,8 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -1280,7 +1281,8 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -1307,7 +1309,10 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -1374,7 +1379,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -1459,7 +1465,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -1486,7 +1493,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -1893,7 +1903,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -1978,7 +1989,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -2005,7 +2017,10 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -2076,9 +2091,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_atomicrmw: @@ -2172,9 +2189,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw: @@ -2202,9 +2221,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_acq_rel_atomicrmw: @@ -2275,9 +2297,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_atomicrmw: @@ -2371,9 +2395,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw: @@ -2401,9 +2427,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_seq_cst_atomicrmw: @@ -2699,12 +2728,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -2815,12 +2844,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -2837,8 +2866,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2854,12 +2881,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -2939,12 +2968,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -3055,12 +3084,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -3077,8 +3106,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3094,12 +3121,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -3737,7 +3766,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; @@ -3860,7 +3890,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -3895,7 +3926,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -4013,9 +4047,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: @@ -4147,9 +4183,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: @@ -4185,9 +4223,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: @@ -4305,9 +4346,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: @@ -4439,9 +4482,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: @@ -4477,9 +4522,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: @@ -5143,9 +5191,11 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_acquire_cmpxchg: @@ -5277,9 +5327,11 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg: @@ -5315,9 +5367,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_release_acquire_cmpxchg: @@ -5435,9 +5490,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: @@ -5569,9 +5626,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: @@ -5607,9 +5666,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: @@ -5727,9 +5789,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: @@ -5861,9 +5925,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: @@ -5899,9 +5965,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: @@ -6019,9 +6088,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: @@ -6153,9 +6224,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: @@ -6191,9 +6264,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: @@ -6929,7 +7005,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -7076,7 +7153,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -7119,7 +7197,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -7251,12 +7332,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -7405,12 +7486,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -7431,8 +7512,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7452,12 +7531,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -7585,12 +7666,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -7739,12 +7820,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -7765,8 +7846,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7786,12 +7865,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -8083,8 +8164,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -8547,12 +8626,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -8701,12 +8780,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -8727,8 +8806,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -8748,12 +8825,14 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -8881,12 +8960,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -9035,12 +9114,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -9061,8 +9140,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -9082,12 +9159,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -9215,12 +9294,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -9369,12 +9448,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -9395,8 +9474,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -9416,12 +9493,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -9549,7 +9628,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9703,7 +9783,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9729,8 +9810,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -9750,7 +9829,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9883,7 +9965,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -10037,7 +10120,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -10082,7 +10166,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -10215,12 +10302,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -10369,12 +10456,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -10395,8 +10482,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -10416,12 +10501,14 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -10549,12 +10636,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -10703,12 +10790,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -10729,8 +10816,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -10750,12 +10835,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -10883,12 +10970,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -11037,12 +11124,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -11063,8 +11150,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11084,12 +11169,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -11758,10 +11845,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -11860,10 +11950,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -11879,8 +11972,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11896,10 +11987,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -12286,6 +12382,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -12367,6 +12465,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -12393,6 +12493,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -12458,6 +12562,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -12539,6 +12645,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -12565,6 +12673,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -12961,6 +13073,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -13042,6 +13156,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -13068,6 +13184,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -13135,7 +13255,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: @@ -13222,7 +13345,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: @@ -13250,7 +13376,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: @@ -13318,7 +13449,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -13405,7 +13539,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -13433,7 +13570,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -13724,10 +13866,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -13833,10 +13978,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -13853,8 +14001,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -13871,10 +14017,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -13953,10 +14104,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -14062,10 +14216,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -14082,8 +14239,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -14100,10 +14255,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -14731,6 +14891,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; @@ -14850,6 +15012,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -14884,6 +15048,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -14998,7 +15166,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -15123,7 +15294,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -15159,7 +15333,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -15274,7 +15453,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -15399,7 +15581,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -15435,7 +15620,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -16078,7 +16268,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -16203,7 +16396,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -16239,7 +16435,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -16354,7 +16555,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -16479,7 +16683,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -16515,7 +16722,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -16630,7 +16842,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -16755,7 +16970,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -16791,7 +17009,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -16906,6 +17129,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; @@ -17031,6 +17256,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -17067,6 +17294,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -17182,6 +17413,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; @@ -17307,6 +17540,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -17343,6 +17578,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -17458,7 +17697,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -17583,7 +17825,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -17619,7 +17864,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -17734,7 +17984,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -17859,7 +18112,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -17895,7 +18151,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -18010,7 +18271,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -18135,7 +18399,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -18171,7 +18438,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -18902,6 +19174,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -19045,6 +19319,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -19087,6 +19363,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -19217,10 +19497,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -19364,10 +19647,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -19388,8 +19674,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -19410,10 +19694,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -19540,10 +19829,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -19687,10 +19979,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -19711,8 +20006,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -19733,10 +20026,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -20024,8 +20322,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -20483,10 +20779,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -20630,10 +20929,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -20654,8 +20956,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -20676,10 +20976,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -20806,10 +21111,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -20953,10 +21261,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -20977,8 +21288,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -20999,10 +21308,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -21129,10 +21443,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -21276,10 +21593,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -21300,8 +21620,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -21322,10 +21640,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -21452,6 +21775,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -21599,6 +21924,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -21623,8 +21950,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -21645,6 +21970,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -21775,6 +22104,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -21922,6 +22253,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -21966,6 +22299,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -22096,10 +22433,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -22243,10 +22583,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -22267,8 +22610,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -22289,10 +22630,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -22419,10 +22765,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -22566,10 +22915,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -22590,8 +22942,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -22612,10 +22962,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -22742,10 +23097,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -22889,10 +23247,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -22913,8 +23274,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -22935,10 +23294,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll index 5c2d8eb..493f985 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -804,8 +804,6 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -822,8 +820,6 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2984,8 +2980,6 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3003,8 +2997,6 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3229,8 +3221,6 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3248,8 +3238,6 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8606,8 +8594,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8629,8 +8615,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8923,8 +8907,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8946,8 +8928,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9219,8 +9199,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9238,8 +9216,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9814,8 +9790,6 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9837,8 +9811,6 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10131,8 +10103,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10154,8 +10124,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10448,8 +10416,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10471,8 +10437,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10765,8 +10729,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10788,8 +10750,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11395,8 +11355,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11418,8 +11376,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11712,8 +11668,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11735,8 +11689,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12029,8 +11981,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12052,8 +12002,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12878,8 +12826,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12896,8 +12842,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -15058,8 +15002,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -15077,8 +15019,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -15303,8 +15243,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -15322,8 +15260,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20384,8 +20320,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20407,8 +20341,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20701,8 +20633,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20724,8 +20654,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20997,8 +20925,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21016,8 +20942,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21592,8 +21516,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21615,8 +21537,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21909,8 +21829,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21932,8 +21850,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22226,8 +22142,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22249,8 +22163,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22543,8 +22455,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22566,8 +22476,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23173,8 +23081,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23196,8 +23102,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23490,8 +23394,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23513,8 +23415,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23807,8 +23707,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23830,8 +23728,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll index 7c0a2ad..40257df 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll @@ -804,8 +804,6 @@ define amdgpu_kernel void @global_cluster_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -822,8 +820,6 @@ define amdgpu_kernel void @global_cluster_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2979,8 +2975,6 @@ define amdgpu_kernel void @global_cluster_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2998,8 +2992,6 @@ define amdgpu_kernel void @global_cluster_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3223,8 +3215,6 @@ define amdgpu_kernel void @global_cluster_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3242,8 +3232,6 @@ define amdgpu_kernel void @global_cluster_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8587,8 +8575,6 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8610,8 +8596,6 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8903,8 +8887,6 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8926,8 +8908,6 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9198,8 +9178,6 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9217,8 +9195,6 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9793,8 +9769,6 @@ define amdgpu_kernel void @global_cluster_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9816,8 +9790,6 @@ define amdgpu_kernel void @global_cluster_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10109,8 +10081,6 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10132,8 +10102,6 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10425,8 +10393,6 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10448,8 +10414,6 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10741,8 +10705,6 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10764,8 +10726,6 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11369,8 +11329,6 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11392,8 +11350,6 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11685,8 +11641,6 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11708,8 +11662,6 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12001,8 +11953,6 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12024,8 +11974,6 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12849,8 +12797,6 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12867,8 +12813,6 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -15024,8 +14968,6 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -15043,8 +14985,6 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -15268,8 +15208,6 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -15287,8 +15225,6 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20337,8 +20273,6 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20360,8 +20294,6 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20653,8 +20585,6 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20676,8 +20606,6 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20948,8 +20876,6 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20967,8 +20893,6 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21543,8 +21467,6 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21566,8 +21488,6 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21859,8 +21779,6 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21882,8 +21800,6 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22175,8 +22091,6 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22198,8 +22112,6 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22491,8 +22403,6 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22514,8 +22424,6 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23119,8 +23027,6 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23142,8 +23048,6 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23435,8 +23339,6 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23458,8 +23360,6 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23751,8 +23651,6 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23774,8 +23672,6 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll index e7880a8..ee5a8bf 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -808,8 +808,6 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -826,8 +824,6 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3021,8 +3017,6 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3041,8 +3035,6 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3272,8 +3264,6 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3292,8 +3282,6 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7235,8 +7223,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7259,8 +7245,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7558,8 +7542,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7582,8 +7564,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7857,8 +7837,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7876,8 +7854,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8459,8 +8435,6 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8483,8 +8457,6 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8782,8 +8754,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8806,8 +8776,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9105,8 +9073,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9129,8 +9095,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9428,8 +9392,6 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9452,8 +9414,6 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10070,8 +10030,6 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10094,8 +10052,6 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10393,8 +10349,6 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10417,8 +10371,6 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10716,8 +10668,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10740,8 +10690,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11570,8 +11518,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11588,8 +11534,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -13783,8 +13727,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -13803,8 +13745,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -14034,8 +13974,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -14054,8 +13992,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19493,8 +19429,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19517,8 +19451,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19816,8 +19748,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19840,8 +19770,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20115,8 +20043,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20134,8 +20060,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20717,8 +20641,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20741,8 +20663,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21040,8 +20960,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21064,8 +20982,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21363,8 +21279,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21387,8 +21301,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21686,8 +21598,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21710,8 +21620,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22328,8 +22236,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22352,8 +22258,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22651,8 +22555,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22675,8 +22577,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22974,8 +22874,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22998,8 +22896,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index 3bf5ed8..c326edf 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -959,7 +959,8 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -1001,7 +1002,8 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -1026,7 +1028,10 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index 885edec..868b438 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -667,7 +667,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -763,7 +764,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -780,8 +782,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -792,7 +792,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -1206,7 +1209,8 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -1292,7 +1296,8 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -1317,7 +1322,10 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; @@ -1393,7 +1401,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -1479,7 +1488,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -1504,7 +1514,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; @@ -1920,7 +1933,8 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -2005,7 +2019,8 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -2030,7 +2045,10 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; @@ -2107,8 +2125,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_atomicrmw: @@ -2198,8 +2218,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_atomicrmw: @@ -2225,8 +2247,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_acq_rel_atomicrmw: @@ -2303,8 +2329,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_atomicrmw: @@ -2394,8 +2422,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_atomicrmw: @@ -2421,8 +2451,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_seq_cst_atomicrmw: @@ -2707,7 +2741,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -2809,7 +2844,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2827,8 +2863,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2841,7 +2875,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2930,7 +2967,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -3032,7 +3070,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3050,8 +3089,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3064,7 +3101,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3650,7 +3690,8 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -3764,7 +3805,8 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -3797,7 +3839,10 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -3906,8 +3951,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: @@ -4026,8 +4073,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: @@ -4061,8 +4110,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: @@ -4171,8 +4224,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: @@ -4291,8 +4346,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: @@ -4326,8 +4383,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: @@ -4926,8 +4987,10 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_acquire_cmpxchg: @@ -5046,8 +5109,10 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_release_acquire_cmpxchg: @@ -5081,8 +5146,12 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_release_acquire_cmpxchg: @@ -5191,8 +5260,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: @@ -5311,8 +5382,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: @@ -5346,8 +5419,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: @@ -5456,8 +5533,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: @@ -5576,8 +5655,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: @@ -5611,8 +5692,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: @@ -5721,7 +5806,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -5841,7 +5927,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -5876,7 +5963,10 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -5986,7 +6076,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -6106,7 +6197,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -6141,7 +6233,10 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -6251,8 +6346,10 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_seq_cst_cmpxchg: @@ -6371,8 +6468,10 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg: @@ -6406,8 +6505,12 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_release_seq_cst_cmpxchg: @@ -6516,8 +6619,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: @@ -6636,8 +6741,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: @@ -6671,8 +6778,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: @@ -6781,8 +6892,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: @@ -6901,8 +7014,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: @@ -6936,8 +7051,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: @@ -7594,7 +7713,8 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -7723,7 +7843,8 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7760,7 +7881,10 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7883,7 +8007,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -8015,7 +8140,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8037,8 +8163,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8055,7 +8179,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8178,7 +8305,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -8310,7 +8438,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8332,8 +8461,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8350,7 +8477,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8610,8 +8740,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9021,7 +9149,8 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -9153,7 +9282,8 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9175,8 +9305,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9193,7 +9321,10 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9316,7 +9447,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -9448,7 +9580,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9470,8 +9603,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9488,7 +9619,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9611,7 +9745,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -9743,7 +9878,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9765,8 +9901,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9783,7 +9917,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9906,7 +10043,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -10038,7 +10176,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10060,8 +10199,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10078,7 +10215,10 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10201,7 +10341,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -10333,7 +10474,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10371,7 +10513,10 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10494,7 +10639,8 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -10626,7 +10772,8 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10648,8 +10795,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10666,7 +10811,10 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10789,7 +10937,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -10921,7 +11070,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10943,8 +11093,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10961,7 +11109,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11084,7 +11235,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -11216,7 +11368,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11238,8 +11391,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11256,7 +11407,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11940,7 +12094,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -12035,7 +12190,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12052,8 +12208,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12064,6 +12218,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -12475,6 +12633,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -12557,6 +12717,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -12581,6 +12743,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; @@ -12654,6 +12820,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -12736,6 +12904,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -12760,6 +12930,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; @@ -13173,6 +13347,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -13254,6 +13430,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -13278,6 +13456,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; @@ -13352,7 +13534,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: @@ -13439,7 +13624,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: @@ -13465,7 +13653,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: @@ -13540,7 +13733,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: @@ -13627,7 +13823,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: @@ -13653,7 +13852,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: @@ -13936,6 +14140,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -14034,6 +14240,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -14051,8 +14259,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -14065,6 +14271,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -14151,6 +14361,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -14249,6 +14461,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -14266,8 +14480,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -14280,6 +14492,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -14863,6 +15079,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -14973,6 +15191,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -15005,6 +15225,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -15111,7 +15335,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -15227,7 +15454,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -15261,7 +15491,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -15368,7 +15603,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -15484,7 +15722,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -15518,7 +15759,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -16115,7 +16361,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: @@ -16231,7 +16480,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: @@ -16265,7 +16517,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: @@ -16372,7 +16629,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -16488,7 +16748,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -16522,7 +16785,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -16629,7 +16897,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -16745,7 +17016,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -16779,7 +17053,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -16886,6 +17165,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -17002,6 +17283,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -17036,6 +17319,10 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -17143,6 +17430,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -17259,6 +17548,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -17293,6 +17584,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -17400,7 +17695,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: @@ -17516,7 +17814,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: @@ -17550,7 +17851,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: @@ -17657,7 +17963,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -17773,7 +18082,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -17807,7 +18119,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -17914,7 +18231,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -18030,7 +18350,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -18064,7 +18387,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -18719,6 +19047,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -18844,6 +19174,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18880,6 +19212,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19000,6 +19336,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -19128,6 +19466,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19149,8 +19489,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19167,6 +19505,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19287,6 +19629,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -19415,6 +19759,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19436,8 +19782,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19454,6 +19798,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19713,8 +20061,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20122,6 +20468,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -20250,6 +20598,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20271,8 +20621,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20289,6 +20637,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20409,6 +20761,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -20537,6 +20891,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20558,8 +20914,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20576,6 +20930,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20696,6 +21054,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -20824,6 +21184,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20845,8 +21207,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20863,6 +21223,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20983,6 +21347,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -21111,6 +21477,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21132,8 +21500,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21150,6 +21516,10 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21270,6 +21640,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -21398,6 +21770,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21435,6 +21809,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21555,6 +21933,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -21683,6 +22063,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21704,8 +22086,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21722,6 +22102,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21842,6 +22226,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -21970,6 +22356,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21991,8 +22379,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22009,6 +22395,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22129,6 +22519,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -22257,6 +22649,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22278,8 +22672,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22296,6 +22688,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll new file mode 100644 index 0000000..0b40c81 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll @@ -0,0 +1,542 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx90a -global-isel=0 < %s | FileCheck --check-prefixes=GFX90A %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx942 -global-isel=0 < %s | FileCheck --check-prefixes=GFX942 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx942 -global-isel=1 < %s | FileCheck --check-prefixes=GFX942-GISEL %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -global-isel=0 < %s | FileCheck --check-prefixes=GFX10 %s + +define amdgpu_ps void @global_load_lds_dword_volatile(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: global_load_lds_dword_volatile: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX90A-NEXT: s_mov_b32 m0, s2 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_load_dword v[0:1], off glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: global_load_lds_dword_volatile: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] +; GFX942-NEXT: s_mov_b32 m0, s2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: global_load_lds_dword_volatile: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: global_load_lds_dword_volatile: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648) + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @global_load_lds_dword_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: global_load_lds_dword_nontemporal: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: s_mov_b32 m0, s2 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX90A-NEXT: global_load_dword v[0:1], off glc slc lds +; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: global_load_lds_dword_nontemporal: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_mov_b32 m0, s2 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] +; GFX942-NEXT: global_load_lds_dword v[0:1], off nt +; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: global_load_lds_dword_nontemporal: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off nt +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: global_load_lds_dword_nontemporal: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: global_load_dword v[0:1], off slc lds +; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0 + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @global_load_lds_dword_volatile_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: global_load_lds_dword_volatile_nontemporal: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX90A-NEXT: s_mov_b32 m0, s2 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_load_dword v[0:1], off glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: global_load_lds_dword_volatile_nontemporal: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] +; GFX942-NEXT: s_mov_b32 m0, s2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: global_load_lds_dword_volatile_nontemporal: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: global_load_lds_dword_volatile_nontemporal: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648), !nontemporal !0 + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @load_to_lds_p1_dword_volatile(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: load_to_lds_p1_dword_volatile: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX90A-NEXT: s_mov_b32 m0, s2 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_load_dword v[0:1], off glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: load_to_lds_p1_dword_volatile: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] +; GFX942-NEXT: s_mov_b32 m0, s2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: load_to_lds_p1_dword_volatile: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: load_to_lds_p1_dword_volatile: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648) + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @load_to_lds_p1_dword_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: load_to_lds_p1_dword_nontemporal: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: s_mov_b32 m0, s2 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX90A-NEXT: global_load_dword v[0:1], off glc slc lds +; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: load_to_lds_p1_dword_nontemporal: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_mov_b32 m0, s2 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] +; GFX942-NEXT: global_load_lds_dword v[0:1], off nt +; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: load_to_lds_p1_dword_nontemporal: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off nt +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: load_to_lds_p1_dword_nontemporal: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: global_load_dword v[0:1], off slc lds +; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0 + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @load_to_lds_p1_dword_volatile_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: load_to_lds_p1_dword_volatile_nontemporal: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX90A-NEXT: s_mov_b32 m0, s2 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_load_dword v[0:1], off glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: load_to_lds_p1_dword_volatile_nontemporal: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] +; GFX942-NEXT: s_mov_b32 m0, s2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: load_to_lds_p1_dword_volatile_nontemporal: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: load_to_lds_p1_dword_volatile_nontemporal: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648), !nontemporal !0 + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @load_to_lds_p7_dword_volatile(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: load_to_lds_p7_dword_volatile: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX90A-NEXT: s_mov_b32 m0, s5 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: load_to_lds_p7_dword_volatile: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-NEXT: s_mov_b32 m0, s5 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: load_to_lds_p7_dword_volatile: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: load_to_lds_p7_dword_volatile: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: s_mov_b32 m0, s5 +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648) + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @load_to_lds_p7_dword_nontemporal(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: load_to_lds_p7_dword_nontemporal: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX90A-NEXT: s_mov_b32 m0, s5 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc slc lds +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: load_to_lds_p7_dword_nontemporal: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-NEXT: s_mov_b32 m0, s5 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: load_to_lds_p7_dword_nontemporal: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: load_to_lds_p7_dword_nontemporal: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: s_mov_b32 m0, s5 +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen slc lds +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0 + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @load_to_lds_p7_dword_volatile_nontemporal(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: load_to_lds_p7_dword_volatile_nontemporal: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX90A-NEXT: s_mov_b32 m0, s5 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: load_to_lds_p7_dword_volatile_nontemporal: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-NEXT: s_mov_b32 m0, s5 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: load_to_lds_p7_dword_volatile_nontemporal: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: load_to_lds_p7_dword_volatile_nontemporal: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: s_mov_b32 m0, s5 +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648), !nontemporal !0 + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @raw_ptr_buffer_load_lds_dword_volatile(ptr addrspace(8) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: raw_ptr_buffer_load_lds_dword_volatile: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_mov_b32 m0, s4 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: raw_ptr_buffer_load_lds_dword_volatile: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_mov_b32 m0, s4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: raw_ptr_buffer_load_lds_dword_volatile: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: s_mov_b32 m0, s4 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: raw_ptr_buffer_load_lds_dword_volatile: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 m0, s4 +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX10-NEXT: s_endpgm + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 0, i32 2147483648) + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @raw_ptr_buffer_load_lds_dword_nontemporal(ptr addrspace(8) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_mov_b32 m0, s4 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc slc lds +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_mov_b32 m0, s4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: s_mov_b32 m0, s4 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 m0, s4 +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen slc lds +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX10-NEXT: s_endpgm + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 0, i32 0), !nontemporal !0 + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @raw_ptr_buffer_load_lds_dword_volatile_nontemporal(ptr addrspace(8) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_mov_b32 m0, s4 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_mov_b32 m0, s4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: s_mov_b32 m0, s4 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 m0, s4 +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX10-NEXT: s_endpgm + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 0, i32 2147483648), !nontemporal !0 + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 512, i32 0) + ret void +} + +!0 = !{i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll index 986b48b..712109d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -622,7 +622,8 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -719,7 +720,8 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -749,7 +751,10 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -1121,7 +1126,8 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1200,7 +1206,8 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1225,7 +1232,10 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; @@ -1291,7 +1301,8 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1370,7 +1381,8 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1395,7 +1407,10 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; @@ -1778,7 +1793,8 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1857,7 +1873,8 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1882,7 +1899,10 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; @@ -1952,7 +1972,8 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -2039,7 +2060,8 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -2067,7 +2089,10 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -2139,7 +2164,8 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -2226,7 +2252,8 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -2254,7 +2281,10 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -2535,7 +2565,8 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2639,7 +2670,8 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2671,7 +2703,10 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2756,7 +2791,8 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2860,7 +2896,8 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2892,7 +2929,10 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3348,7 +3388,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -3441,7 +3482,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -3470,7 +3512,10 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -3551,7 +3596,8 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -3652,7 +3698,8 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -3684,7 +3731,10 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -3767,7 +3817,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -3868,7 +3919,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -3900,7 +3952,10 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -4375,7 +4430,8 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -4476,7 +4532,8 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -4508,7 +4565,10 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -4591,7 +4651,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -4692,7 +4753,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -4724,7 +4786,10 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -4807,7 +4872,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -4908,7 +4974,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -4940,7 +5007,10 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5023,7 +5093,8 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5124,7 +5195,8 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5156,7 +5228,10 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5239,7 +5314,8 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5340,7 +5416,8 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5372,7 +5449,10 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5455,7 +5535,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5556,7 +5637,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5588,7 +5670,10 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5671,7 +5756,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5772,7 +5858,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5804,7 +5891,10 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5887,7 +5977,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5988,7 +6079,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -6020,7 +6112,10 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -6567,7 +6662,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6682,7 +6778,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6717,7 +6814,10 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -6814,7 +6914,8 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -6932,7 +7033,8 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -6968,7 +7070,10 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7065,7 +7170,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -7183,7 +7289,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7219,7 +7326,10 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7778,7 +7888,8 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -7896,7 +8007,8 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7932,7 +8044,10 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8029,7 +8144,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8147,7 +8263,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8183,7 +8300,10 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8280,7 +8400,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8398,7 +8519,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8434,7 +8556,10 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8531,7 +8656,8 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8649,7 +8775,8 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8685,7 +8812,10 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8782,7 +8912,8 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8900,7 +9031,8 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8936,7 +9068,10 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9033,7 +9168,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9151,7 +9287,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9187,7 +9324,10 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9284,7 +9424,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9402,7 +9543,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9438,7 +9580,10 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9535,7 +9680,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9653,7 +9799,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9689,7 +9836,10 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll index 8926893..6d1e4e6 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll @@ -622,7 +622,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_load( ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -719,7 +720,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_load( ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -749,7 +751,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_load( ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -1121,7 +1126,8 @@ define amdgpu_kernel void @local_cluster_release_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1200,7 +1206,8 @@ define amdgpu_kernel void @local_cluster_release_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1225,7 +1232,10 @@ define amdgpu_kernel void @local_cluster_release_store( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; @@ -1291,7 +1301,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1370,7 +1381,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1395,7 +1407,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_store( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; @@ -1778,7 +1793,8 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1857,7 +1873,8 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1882,7 +1899,10 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; @@ -1952,7 +1972,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -2039,7 +2060,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -2067,7 +2089,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -2139,7 +2164,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -2226,7 +2252,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -2254,7 +2281,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -2535,7 +2565,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2639,7 +2670,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2671,7 +2703,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2756,7 +2791,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2860,7 +2896,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2892,7 +2929,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3348,7 +3388,8 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -3441,7 +3482,8 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -3470,7 +3512,10 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -3551,7 +3596,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -3652,7 +3698,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -3684,7 +3731,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -3767,7 +3817,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -3868,7 +3919,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -3900,7 +3952,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -4375,7 +4430,8 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -4476,7 +4532,8 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -4508,7 +4565,10 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -4591,7 +4651,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -4692,7 +4753,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -4724,7 +4786,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -4807,7 +4872,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -4908,7 +4974,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -4940,7 +5007,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5023,7 +5093,8 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5124,7 +5195,8 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5156,7 +5228,10 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5239,7 +5314,8 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5340,7 +5416,8 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5372,7 +5449,10 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5455,7 +5535,8 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5556,7 +5637,8 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5588,7 +5670,10 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5671,7 +5756,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5772,7 +5858,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5804,7 +5891,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5887,7 +5977,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5988,7 +6079,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -6020,7 +6112,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -6567,7 +6662,8 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6682,7 +6778,8 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6717,7 +6814,10 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -6814,7 +6914,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -6932,7 +7033,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -6968,7 +7070,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7065,7 +7170,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -7183,7 +7289,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7219,7 +7326,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7778,7 +7888,8 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -7896,7 +8007,8 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7932,7 +8044,10 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8029,7 +8144,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8147,7 +8263,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8183,7 +8300,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8280,7 +8400,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8398,7 +8519,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8434,7 +8556,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8531,7 +8656,8 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8649,7 +8775,8 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8685,7 +8812,10 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8782,7 +8912,8 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8900,7 +9031,8 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8936,7 +9068,10 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9033,7 +9168,8 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9151,7 +9287,8 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9187,7 +9324,10 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9284,7 +9424,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9402,7 +9543,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9438,7 +9580,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9535,7 +9680,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9653,7 +9799,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9689,7 +9836,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll index 81bbe0a..577d2ca 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -622,7 +622,8 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -719,7 +720,8 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -749,7 +751,10 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -1121,7 +1126,8 @@ define amdgpu_kernel void @local_system_release_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1200,7 +1206,8 @@ define amdgpu_kernel void @local_system_release_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1225,7 +1232,10 @@ define amdgpu_kernel void @local_system_release_store( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; @@ -1291,7 +1301,8 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1370,7 +1381,8 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1395,7 +1407,10 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; @@ -1778,7 +1793,8 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1857,7 +1873,8 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1882,7 +1899,10 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; @@ -1952,7 +1972,8 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -2039,7 +2060,8 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -2067,7 +2089,10 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -2139,7 +2164,8 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -2226,7 +2252,8 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -2254,7 +2281,10 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -2535,7 +2565,8 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2639,7 +2670,8 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2671,7 +2703,10 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2756,7 +2791,8 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2860,7 +2896,8 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2892,7 +2929,10 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3348,7 +3388,8 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -3441,7 +3482,8 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -3470,7 +3512,10 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -3551,7 +3596,8 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -3652,7 +3698,8 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -3684,7 +3731,10 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -3767,7 +3817,8 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -3868,7 +3919,8 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -3900,7 +3952,10 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -4375,7 +4430,8 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -4476,7 +4532,8 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -4508,7 +4565,10 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -4591,7 +4651,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -4692,7 +4753,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -4724,7 +4786,10 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -4807,7 +4872,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -4908,7 +4974,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -4940,7 +5007,10 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5023,7 +5093,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5124,7 +5195,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5156,7 +5228,10 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5239,7 +5314,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5340,7 +5416,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5372,7 +5449,10 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5455,7 +5535,8 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5556,7 +5637,8 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5588,7 +5670,10 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5671,7 +5756,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5772,7 +5858,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5804,7 +5891,10 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5887,7 +5977,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5988,7 +6079,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -6020,7 +6112,10 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -6567,7 +6662,8 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6682,7 +6778,8 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6717,7 +6814,10 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -6814,7 +6914,8 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -6932,7 +7033,8 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -6968,7 +7070,10 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7065,7 +7170,8 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -7183,7 +7289,8 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7219,7 +7326,10 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7778,7 +7888,8 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -7896,7 +8007,8 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7932,7 +8044,10 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8029,7 +8144,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8147,7 +8263,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8183,7 +8300,10 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8280,7 +8400,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8398,7 +8519,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8434,7 +8556,10 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8531,7 +8656,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8649,7 +8775,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8685,7 +8812,10 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8782,7 +8912,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8900,7 +9031,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8936,7 +9068,10 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9033,7 +9168,8 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9151,7 +9287,8 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9187,7 +9324,10 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9284,7 +9424,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9402,7 +9543,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9438,7 +9580,10 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9535,7 +9680,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9653,7 +9799,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9689,7 +9836,10 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index 980141a..d686e7a 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -819,7 +819,8 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -854,7 +855,8 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -879,7 +881,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll index 6a233a2..ab4d783 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -622,7 +622,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -719,7 +720,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -749,7 +751,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -1121,7 +1126,8 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1200,7 +1206,8 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1225,7 +1232,10 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; @@ -1291,7 +1301,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1370,7 +1381,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1395,7 +1407,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; @@ -1778,7 +1793,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1857,7 +1873,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1882,7 +1899,10 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; @@ -1952,7 +1972,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -2039,7 +2060,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -2067,7 +2089,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -2139,7 +2164,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -2226,7 +2252,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -2254,7 +2281,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -2535,7 +2565,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2639,7 +2670,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2671,7 +2703,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2756,7 +2791,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2860,7 +2896,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2892,7 +2929,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3348,7 +3388,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -3441,7 +3482,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -3470,7 +3512,10 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -3551,7 +3596,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -3652,7 +3698,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -3684,7 +3731,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -3767,7 +3817,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -3868,7 +3919,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -3900,7 +3952,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -4375,7 +4430,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -4476,7 +4532,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -4508,7 +4565,10 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -4591,7 +4651,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -4692,7 +4753,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -4724,7 +4786,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -4807,7 +4872,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -4908,7 +4974,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -4940,7 +5007,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5023,7 +5093,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5124,7 +5195,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5156,7 +5228,10 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5239,7 +5314,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5340,7 +5416,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5372,7 +5449,10 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5455,7 +5535,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5556,7 +5637,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5588,7 +5670,10 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5671,7 +5756,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5772,7 +5858,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5804,7 +5891,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5887,7 +5977,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5988,7 +6079,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -6020,7 +6112,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -6567,7 +6662,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6682,7 +6778,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6717,7 +6814,10 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -6814,7 +6914,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -6932,7 +7033,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -6968,7 +7070,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7065,7 +7170,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -7183,7 +7289,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7219,7 +7326,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7778,7 +7888,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -7896,7 +8007,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7932,7 +8044,10 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8029,7 +8144,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8147,7 +8263,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8183,7 +8300,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8280,7 +8400,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8398,7 +8519,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8434,7 +8556,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8531,7 +8656,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8649,7 +8775,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8685,7 +8812,10 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8782,7 +8912,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8900,7 +9031,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8936,7 +9068,10 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9033,7 +9168,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9151,7 +9287,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9187,7 +9324,10 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9284,7 +9424,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9402,7 +9543,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9438,7 +9580,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9535,7 +9680,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9653,7 +9799,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9689,7 +9836,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir new file mode 100644 index 0000000..93f7bcc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir @@ -0,0 +1,83 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -passes=postmisched -o - %s | FileCheck %s + +# Ensure WMMA operations stay before the final atomic fence and barrier group. +# This allows the latency of the WMMA operations to be hidden by barrier wait. +--- +name: test +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0, $sgpr12, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8, $vgpr9_vgpr10_vgpr11_vgpr12, $vgpr13_vgpr14_vgpr15_vgpr16, $vgpr17_vgpr18_vgpr19_vgpr20, $vgpr21_vgpr22_vgpr23_vgpr24, $vgpr25_vgpr26_vgpr27_vgpr28, $vgpr29_vgpr30_vgpr31_vgpr32 + + ; CHECK-LABEL: name: test + ; CHECK: liveins: $sgpr0, $sgpr12, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8, $vgpr9_vgpr10_vgpr11_vgpr12, $vgpr13_vgpr14_vgpr15_vgpr16, $vgpr17_vgpr18_vgpr19_vgpr20, $vgpr21_vgpr22_vgpr23_vgpr24, $vgpr25_vgpr26_vgpr27_vgpr28, $vgpr29_vgpr30_vgpr31_vgpr32 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ATOMIC_FENCE 5, 2 + ; CHECK-NEXT: S_BARRIER + ; CHECK-NEXT: ATOMIC_FENCE 4, 2 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr41_vgpr42_vgpr43_vgpr44, implicit-def $vgpr41, implicit-def $vgpr41_lo16, implicit-def $vgpr41_hi16, implicit-def $vgpr42, implicit-def $vgpr42_lo16, implicit-def $vgpr42_hi16, implicit-def $vgpr43, implicit-def $vgpr43_lo16, implicit-def $vgpr43_hi16, implicit-def $vgpr44, implicit-def $vgpr44_lo16, implicit-def $vgpr44_hi16, implicit-def $vgpr41_vgpr42, implicit-def $vgpr41_vgpr42_vgpr43, implicit-def $vgpr42_vgpr43, implicit-def $vgpr42_vgpr43_vgpr44, implicit-def $vgpr43_vgpr44, implicit-def $vgpr45_vgpr46_vgpr47_vgpr48, implicit-def $vgpr45, implicit-def $vgpr45_lo16, implicit-def $vgpr45_hi16, implicit-def $vgpr46, implicit-def $vgpr46_lo16, implicit-def $vgpr46_hi16, implicit-def $vgpr47, implicit-def $vgpr47_lo16, implicit-def $vgpr47_hi16, implicit-def $vgpr48, implicit-def $vgpr48_lo16, implicit-def $vgpr48_hi16, implicit-def $vgpr45_vgpr46, implicit-def $vgpr45_vgpr46_vgpr47, implicit-def $vgpr46_vgpr47, implicit-def $vgpr46_vgpr47_vgpr48, implicit-def $vgpr47_vgpr48, implicit-def $vgpr49_vgpr50_vgpr51_vgpr52, implicit-def $vgpr49, implicit-def $vgpr49_lo16, implicit-def $vgpr49_hi16, implicit-def $vgpr50, implicit-def $vgpr50_lo16, implicit-def $vgpr50_hi16, implicit-def $vgpr51, implicit-def $vgpr51_lo16, implicit-def $vgpr51_hi16, implicit-def $vgpr52, implicit-def $vgpr52_lo16, implicit-def $vgpr52_hi16, implicit-def $vgpr49_vgpr50, implicit-def $vgpr49_vgpr50_vgpr51, implicit-def $vgpr50_vgpr51, implicit-def $vgpr50_vgpr51_vgpr52, implicit-def $vgpr51_vgpr52, implicit-def $vgpr53_vgpr54_vgpr55_vgpr56, implicit-def $vgpr53, implicit-def $vgpr53_lo16, implicit-def $vgpr53_hi16, implicit-def $vgpr54, implicit-def $vgpr54_lo16, implicit-def $vgpr54_hi16, implicit-def $vgpr55, implicit-def $vgpr55_lo16, implicit-def $vgpr55_hi16, implicit-def $vgpr56, implicit-def $vgpr56_lo16, implicit-def $vgpr56_hi16, implicit-def $vgpr53_vgpr54, implicit-def $vgpr53_vgpr54_vgpr55, implicit-def $vgpr54_vgpr55, implicit-def $vgpr54_vgpr55_vgpr56, implicit-def $vgpr55_vgpr56, implicit-def $vgpr57_vgpr58_vgpr59_vgpr60, implicit-def $vgpr57, implicit-def $vgpr57_lo16, implicit-def $vgpr57_hi16, implicit-def $vgpr58, implicit-def $vgpr58_lo16, implicit-def $vgpr58_hi16, implicit-def $vgpr59, implicit-def $vgpr59_lo16, implicit-def $vgpr59_hi16, implicit-def $vgpr60, implicit-def $vgpr60_lo16, implicit-def $vgpr60_hi16, implicit-def $vgpr57_vgpr58, implicit-def $vgpr57_vgpr58_vgpr59, implicit-def $vgpr58_vgpr59, implicit-def $vgpr58_vgpr59_vgpr60, implicit-def $vgpr59_vgpr60, implicit-def $vgpr61_vgpr62_vgpr63_vgpr64, implicit-def $vgpr61, implicit-def $vgpr61_lo16, implicit-def $vgpr61_hi16, implicit-def $vgpr62, implicit-def $vgpr62_lo16, implicit-def $vgpr62_hi16, implicit-def $vgpr63, implicit-def $vgpr63_lo16, implicit-def $vgpr63_hi16, implicit-def $vgpr64, implicit-def $vgpr64_lo16, implicit-def $vgpr64_hi16, implicit-def $vgpr61_vgpr62, implicit-def $vgpr61_vgpr62_vgpr63, implicit-def $vgpr62_vgpr63, implicit-def $vgpr62_vgpr63_vgpr64, implicit-def $vgpr63_vgpr64, implicit-def $vgpr65_vgpr66_vgpr67_vgpr68, implicit-def $vgpr65, implicit-def $vgpr65_lo16, implicit-def $vgpr65_hi16, implicit-def $vgpr66, implicit-def $vgpr66_lo16, implicit-def $vgpr66_hi16, implicit-def $vgpr67, implicit-def $vgpr67_lo16, implicit-def $vgpr67_hi16, implicit-def $vgpr68, implicit-def $vgpr68_lo16, implicit-def $vgpr68_hi16, implicit-def $vgpr65_vgpr66, implicit-def $vgpr65_vgpr66_vgpr67, implicit-def $vgpr66_vgpr67, implicit-def $vgpr66_vgpr67_vgpr68, implicit-def $vgpr67_vgpr68, implicit-def $vgpr69_vgpr70_vgpr71_vgpr72, implicit-def $vgpr69, implicit-def $vgpr69_lo16, implicit-def $vgpr69_hi16, implicit-def $vgpr70, implicit-def $vgpr70_lo16, implicit-def $vgpr70_hi16, implicit-def $vgpr71, implicit-def $vgpr71_lo16, implicit-def $vgpr71_hi16, implicit-def $vgpr72, implicit-def $vgpr72_lo16, implicit-def $vgpr72_hi16, implicit-def $vgpr69_vgpr70, implicit-def $vgpr69_vgpr70_vgpr71, implicit-def $vgpr70_vgpr71, implicit-def $vgpr70_vgpr71_vgpr72, implicit-def $vgpr71_vgpr72, implicit-def $vgpr73_vgpr74_vgpr75_vgpr76, implicit-def $vgpr73, implicit-def $vgpr73_lo16, implicit-def $vgpr73_hi16, implicit-def $vgpr74, implicit-def $vgpr74_lo16, implicit-def $vgpr74_hi16, implicit-def $vgpr75, implicit-def $vgpr75_lo16, implicit-def $vgpr75_hi16, implicit-def $vgpr76, implicit-def $vgpr76_lo16, implicit-def $vgpr76_hi16, implicit-def $vgpr73_vgpr74, implicit-def $vgpr73_vgpr74_vgpr75, implicit-def $vgpr74_vgpr75, implicit-def $vgpr74_vgpr75_vgpr76, implicit-def $vgpr75_vgpr76, implicit-def $vgpr77_vgpr78_vgpr79_vgpr80, implicit-def $vgpr77, implicit-def $vgpr77_lo16, implicit-def $vgpr77_hi16, implicit-def $vgpr78, implicit-def $vgpr78_lo16, implicit-def $vgpr78_hi16, implicit-def $vgpr79, implicit-def $vgpr79_lo16, implicit-def $vgpr79_hi16, implicit-def $vgpr80, implicit-def $vgpr80_lo16, implicit-def $vgpr80_hi16, implicit-def $vgpr77_vgpr78, implicit-def $vgpr77_vgpr78_vgpr79, implicit-def $vgpr78_vgpr79, implicit-def $vgpr78_vgpr79_vgpr80, implicit-def $vgpr79_vgpr80, implicit-def $vgpr81_vgpr82_vgpr83_vgpr84, implicit-def $vgpr81, implicit-def $vgpr81_lo16, implicit-def $vgpr81_hi16, implicit-def $vgpr82, implicit-def $vgpr82_lo16, implicit-def $vgpr82_hi16, implicit-def $vgpr83, implicit-def $vgpr83_lo16, implicit-def $vgpr83_hi16, implicit-def $vgpr84, implicit-def $vgpr84_lo16, implicit-def $vgpr84_hi16, implicit-def $vgpr81_vgpr82, implicit-def $vgpr81_vgpr82_vgpr83, implicit-def $vgpr82_vgpr83, implicit-def $vgpr82_vgpr83_vgpr84, implicit-def $vgpr83_vgpr84, implicit-def $vgpr85_vgpr86_vgpr87_vgpr88, implicit-def $vgpr85, implicit-def $vgpr85_lo16, implicit-def $vgpr85_hi16, implicit-def $vgpr86, implicit-def $vgpr86_lo16, implicit-def $vgpr86_hi16, implicit-def $vgpr87, implicit-def $vgpr87_lo16, implicit-def $vgpr87_hi16, implicit-def $vgpr88, implicit-def $vgpr88_lo16, implicit-def $vgpr88_hi16, implicit-def $vgpr85_vgpr86, implicit-def $vgpr85_vgpr86_vgpr87, implicit-def $vgpr86_vgpr87, implicit-def $vgpr86_vgpr87_vgpr88, implicit-def $vgpr87_vgpr88, implicit killed $vgpr36, implicit $exec, implicit killed $vgpr37, implicit killed $vgpr38 { + ; CHECK-NEXT: $vgpr41_vgpr42_vgpr43_vgpr44 = DS_READ_B128_gfx9 $vgpr36, 0, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: $vgpr45_vgpr46_vgpr47_vgpr48 = DS_READ2_B64_gfx9 $vgpr36, 2, 3, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: $vgpr49_vgpr50_vgpr51_vgpr52 = DS_READ_B128_gfx9 $vgpr37, 0, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: $vgpr53_vgpr54_vgpr55_vgpr56 = DS_READ2_B64_gfx9 $vgpr37, 2, 3, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: $vgpr57_vgpr58_vgpr59_vgpr60 = DS_READ_B128_gfx9 $vgpr37, 768, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: $vgpr61_vgpr62_vgpr63_vgpr64 = DS_READ2_B64_gfx9 killed $vgpr37, 98, 99, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: $vgpr65_vgpr66_vgpr67_vgpr68 = DS_READ_B128_gfx9 $vgpr36, 768, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: $vgpr69_vgpr70_vgpr71_vgpr72 = DS_READ2_B64_gfx9 $vgpr36, 98, 99, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: $vgpr73_vgpr74_vgpr75_vgpr76 = DS_READ_B128_gfx9 $vgpr36, 1536, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: $vgpr77_vgpr78_vgpr79_vgpr80 = DS_READ2_B64_gfx9 $vgpr36, 194, 195, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: $vgpr81_vgpr82_vgpr83_vgpr84 = DS_READ_B128_gfx9 killed $vgpr36, 2304, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: $vgpr85_vgpr86_vgpr87_vgpr88 = DS_READ2_B64_gfx9 killed $vgpr38, 0, 1, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: } + ; CHECK-NEXT: $sgpr1 = S_ADD_I32 $sgpr0, 16, implicit-def dead $scc + ; CHECK-NEXT: $vgpr39 = V_ADD_U32_e32 32, killed $vgpr39, implicit $exec + ; CHECK-NEXT: $vgpr40 = V_ADD_U32_e32 32, killed $vgpr40, implicit $exec + ; CHECK-NEXT: S_CMP_LT_U32 killed $sgpr0, killed $sgpr12, implicit-def $scc + ; CHECK-NEXT: $sgpr0 = S_MOV_B32 killed $sgpr1 + ; CHECK-NEXT: early-clobber $vgpr29_vgpr30_vgpr31_vgpr32 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr29_vgpr30_vgpr31_vgpr32, 0, 0, implicit $exec + ; CHECK-NEXT: early-clobber $vgpr25_vgpr26_vgpr27_vgpr28 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr25_vgpr26_vgpr27_vgpr28, 0, 0, implicit $exec + ; CHECK-NEXT: early-clobber $vgpr21_vgpr22_vgpr23_vgpr24 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr21_vgpr22_vgpr23_vgpr24, 0, 0, implicit $exec + ; CHECK-NEXT: early-clobber $vgpr17_vgpr18_vgpr19_vgpr20 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr17_vgpr18_vgpr19_vgpr20, 0, 0, implicit $exec + ; CHECK-NEXT: early-clobber $vgpr13_vgpr14_vgpr15_vgpr16 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr13_vgpr14_vgpr15_vgpr16, 0, 0, implicit $exec + ; CHECK-NEXT: early-clobber $vgpr9_vgpr10_vgpr11_vgpr12 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr9_vgpr10_vgpr11_vgpr12, 0, 0, implicit $exec + ; CHECK-NEXT: early-clobber $vgpr5_vgpr6_vgpr7_vgpr8 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88, 8, killed $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr5_vgpr6_vgpr7_vgpr8, 0, 0, implicit $exec + ; CHECK-NEXT: early-clobber $vgpr1_vgpr2_vgpr3_vgpr4 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88, 8, killed $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr1_vgpr2_vgpr3_vgpr4, 0, 0, implicit $exec + ; CHECK-NEXT: ATOMIC_FENCE 5, 2 + ; CHECK-NEXT: S_BARRIER + ; CHECK-NEXT: ATOMIC_FENCE 4, 2 + ATOMIC_FENCE 5, 2 + S_BARRIER + ATOMIC_FENCE 4, 2 + BUNDLE implicit-def $vgpr41_vgpr42_vgpr43_vgpr44, implicit-def $vgpr41, implicit-def $vgpr41_lo16, implicit-def $vgpr41_hi16, implicit-def $vgpr42, implicit-def $vgpr42_lo16, implicit-def $vgpr42_hi16, implicit-def $vgpr43, implicit-def $vgpr43_lo16, implicit-def $vgpr43_hi16, implicit-def $vgpr44, implicit-def $vgpr44_lo16, implicit-def $vgpr44_hi16, implicit-def $vgpr41_vgpr42, implicit-def $vgpr41_vgpr42_vgpr43, implicit-def $vgpr42_vgpr43, implicit-def $vgpr42_vgpr43_vgpr44, implicit-def $vgpr43_vgpr44, implicit-def $vgpr45_vgpr46_vgpr47_vgpr48, implicit-def $vgpr45, implicit-def $vgpr45_lo16, implicit-def $vgpr45_hi16, implicit-def $vgpr46, implicit-def $vgpr46_lo16, implicit-def $vgpr46_hi16, implicit-def $vgpr47, implicit-def $vgpr47_lo16, implicit-def $vgpr47_hi16, implicit-def $vgpr48, implicit-def $vgpr48_lo16, implicit-def $vgpr48_hi16, implicit-def $vgpr45_vgpr46, implicit-def $vgpr45_vgpr46_vgpr47, implicit-def $vgpr46_vgpr47, implicit-def $vgpr46_vgpr47_vgpr48, implicit-def $vgpr47_vgpr48, implicit-def $vgpr49_vgpr50_vgpr51_vgpr52, implicit-def $vgpr49, implicit-def $vgpr49_lo16, implicit-def $vgpr49_hi16, implicit-def $vgpr50, implicit-def $vgpr50_lo16, implicit-def $vgpr50_hi16, implicit-def $vgpr51, implicit-def $vgpr51_lo16, implicit-def $vgpr51_hi16, implicit-def $vgpr52, implicit-def $vgpr52_lo16, implicit-def $vgpr52_hi16, implicit-def $vgpr49_vgpr50, implicit-def $vgpr49_vgpr50_vgpr51, implicit-def $vgpr50_vgpr51, implicit-def $vgpr50_vgpr51_vgpr52, implicit-def $vgpr51_vgpr52, implicit-def $vgpr53_vgpr54_vgpr55_vgpr56, implicit-def $vgpr53, implicit-def $vgpr53_lo16, implicit-def $vgpr53_hi16, implicit-def $vgpr54, implicit-def $vgpr54_lo16, implicit-def $vgpr54_hi16, implicit-def $vgpr55, implicit-def $vgpr55_lo16, implicit-def $vgpr55_hi16, implicit-def $vgpr56, implicit-def $vgpr56_lo16, implicit-def $vgpr56_hi16, implicit-def $vgpr53_vgpr54, implicit-def $vgpr53_vgpr54_vgpr55, implicit-def $vgpr54_vgpr55, implicit-def $vgpr54_vgpr55_vgpr56, implicit-def $vgpr55_vgpr56, implicit-def $vgpr57_vgpr58_vgpr59_vgpr60, implicit-def $vgpr57, implicit-def $vgpr57_lo16, implicit-def $vgpr57_hi16, implicit-def $vgpr58, implicit-def $vgpr58_lo16, implicit-def $vgpr58_hi16, implicit-def $vgpr59, implicit-def $vgpr59_lo16, implicit-def $vgpr59_hi16, implicit-def $vgpr60, implicit-def $vgpr60_lo16, implicit-def $vgpr60_hi16, implicit-def $vgpr57_vgpr58, implicit-def $vgpr57_vgpr58_vgpr59, implicit-def $vgpr58_vgpr59, implicit-def $vgpr58_vgpr59_vgpr60, implicit-def $vgpr59_vgpr60, implicit-def $vgpr61_vgpr62_vgpr63_vgpr64, implicit-def $vgpr61, implicit-def $vgpr61_lo16, implicit-def $vgpr61_hi16, implicit-def $vgpr62, implicit-def $vgpr62_lo16, implicit-def $vgpr62_hi16, implicit-def $vgpr63, implicit-def $vgpr63_lo16, implicit-def $vgpr63_hi16, implicit-def $vgpr64, implicit-def $vgpr64_lo16, implicit-def $vgpr64_hi16, implicit-def $vgpr61_vgpr62, implicit-def $vgpr61_vgpr62_vgpr63, implicit-def $vgpr62_vgpr63, implicit-def $vgpr62_vgpr63_vgpr64, implicit-def $vgpr63_vgpr64, implicit-def $vgpr65_vgpr66_vgpr67_vgpr68, implicit-def $vgpr65, implicit-def $vgpr65_lo16, implicit-def $vgpr65_hi16, implicit-def $vgpr66, implicit-def $vgpr66_lo16, implicit-def $vgpr66_hi16, implicit-def $vgpr67, implicit-def $vgpr67_lo16, implicit-def $vgpr67_hi16, implicit-def $vgpr68, implicit-def $vgpr68_lo16, implicit-def $vgpr68_hi16, implicit-def $vgpr65_vgpr66, implicit-def $vgpr65_vgpr66_vgpr67, implicit-def $vgpr66_vgpr67, implicit-def $vgpr66_vgpr67_vgpr68, implicit-def $vgpr67_vgpr68, implicit-def $vgpr69_vgpr70_vgpr71_vgpr72, implicit-def $vgpr69, implicit-def $vgpr69_lo16, implicit-def $vgpr69_hi16, implicit-def $vgpr70, implicit-def $vgpr70_lo16, implicit-def $vgpr70_hi16, implicit-def $vgpr71, implicit-def $vgpr71_lo16, implicit-def $vgpr71_hi16, implicit-def $vgpr72, implicit-def $vgpr72_lo16, implicit-def $vgpr72_hi16, implicit-def $vgpr69_vgpr70, implicit-def $vgpr69_vgpr70_vgpr71, implicit-def $vgpr70_vgpr71, implicit-def $vgpr70_vgpr71_vgpr72, implicit-def $vgpr71_vgpr72, implicit-def $vgpr73_vgpr74_vgpr75_vgpr76, implicit-def $vgpr73, implicit-def $vgpr73_lo16, implicit-def $vgpr73_hi16, implicit-def $vgpr74, implicit-def $vgpr74_lo16, implicit-def $vgpr74_hi16, implicit-def $vgpr75, implicit-def $vgpr75_lo16, implicit-def $vgpr75_hi16, implicit-def $vgpr76, implicit-def $vgpr76_lo16, implicit-def $vgpr76_hi16, implicit-def $vgpr73_vgpr74, implicit-def $vgpr73_vgpr74_vgpr75, implicit-def $vgpr74_vgpr75, implicit-def $vgpr74_vgpr75_vgpr76, implicit-def $vgpr75_vgpr76, implicit-def $vgpr77_vgpr78_vgpr79_vgpr80, implicit-def $vgpr77, implicit-def $vgpr77_lo16, implicit-def $vgpr77_hi16, implicit-def $vgpr78, implicit-def $vgpr78_lo16, implicit-def $vgpr78_hi16, implicit-def $vgpr79, implicit-def $vgpr79_lo16, implicit-def $vgpr79_hi16, implicit-def $vgpr80, implicit-def $vgpr80_lo16, implicit-def $vgpr80_hi16, implicit-def $vgpr77_vgpr78, implicit-def $vgpr77_vgpr78_vgpr79, implicit-def $vgpr78_vgpr79, implicit-def $vgpr78_vgpr79_vgpr80, implicit-def $vgpr79_vgpr80, implicit-def $vgpr81_vgpr82_vgpr83_vgpr84, implicit-def $vgpr81, implicit-def $vgpr81_lo16, implicit-def $vgpr81_hi16, implicit-def $vgpr82, implicit-def $vgpr82_lo16, implicit-def $vgpr82_hi16, implicit-def $vgpr83, implicit-def $vgpr83_lo16, implicit-def $vgpr83_hi16, implicit-def $vgpr84, implicit-def $vgpr84_lo16, implicit-def $vgpr84_hi16, implicit-def $vgpr81_vgpr82, implicit-def $vgpr81_vgpr82_vgpr83, implicit-def $vgpr82_vgpr83, implicit-def $vgpr82_vgpr83_vgpr84, implicit-def $vgpr83_vgpr84, implicit-def $vgpr85_vgpr86_vgpr87_vgpr88, implicit-def $vgpr85, implicit-def $vgpr85_lo16, implicit-def $vgpr85_hi16, implicit-def $vgpr86, implicit-def $vgpr86_lo16, implicit-def $vgpr86_hi16, implicit-def $vgpr87, implicit-def $vgpr87_lo16, implicit-def $vgpr87_hi16, implicit-def $vgpr88, implicit-def $vgpr88_lo16, implicit-def $vgpr88_hi16, implicit-def $vgpr85_vgpr86, implicit-def $vgpr85_vgpr86_vgpr87, implicit-def $vgpr86_vgpr87, implicit-def $vgpr86_vgpr87_vgpr88, implicit-def $vgpr87_vgpr88, implicit $vgpr36, implicit $exec, implicit $vgpr37, implicit $vgpr38 { + $vgpr41_vgpr42_vgpr43_vgpr44 = DS_READ_B128_gfx9 $vgpr36, 0, 0, implicit $exec :: (load (s128), addrspace 3) + $vgpr45_vgpr46_vgpr47_vgpr48 = DS_READ2_B64_gfx9 $vgpr36, 2, 3, 0, implicit $exec :: (load (s128), addrspace 3) + $vgpr49_vgpr50_vgpr51_vgpr52 = DS_READ_B128_gfx9 $vgpr37, 0, 0, implicit $exec :: (load (s128), addrspace 3) + $vgpr53_vgpr54_vgpr55_vgpr56 = DS_READ2_B64_gfx9 $vgpr37, 2, 3, 0, implicit $exec :: (load (s128), addrspace 3) + $vgpr57_vgpr58_vgpr59_vgpr60 = DS_READ_B128_gfx9 $vgpr37, 768, 0, implicit $exec :: (load (s128), addrspace 3) + $vgpr61_vgpr62_vgpr63_vgpr64 = DS_READ2_B64_gfx9 $vgpr37, 98, 99, 0, implicit $exec :: (load (s128), addrspace 3) + $vgpr65_vgpr66_vgpr67_vgpr68 = DS_READ_B128_gfx9 $vgpr36, 768, 0, implicit $exec :: (load (s128), addrspace 3) + $vgpr69_vgpr70_vgpr71_vgpr72 = DS_READ2_B64_gfx9 $vgpr36, 98, 99, 0, implicit $exec :: (load (s128), addrspace 3) + $vgpr73_vgpr74_vgpr75_vgpr76 = DS_READ_B128_gfx9 $vgpr36, 1536, 0, implicit $exec :: (load (s128), addrspace 3) + $vgpr77_vgpr78_vgpr79_vgpr80 = DS_READ2_B64_gfx9 $vgpr36, 194, 195, 0, implicit $exec :: (load (s128), addrspace 3) + $vgpr81_vgpr82_vgpr83_vgpr84 = DS_READ_B128_gfx9 $vgpr36, 2304, 0, implicit $exec :: (load (s128), addrspace 3) + $vgpr85_vgpr86_vgpr87_vgpr88 = DS_READ2_B64_gfx9 $vgpr38, 0, 1, 0, implicit $exec :: (load (s128), addrspace 3) + } + $sgpr1 = S_ADD_I32 $sgpr0, 16, implicit-def dead $scc + $vgpr39 = V_ADD_U32_e32 32, killed $vgpr39, implicit $exec + $vgpr40 = V_ADD_U32_e32 32, killed $vgpr40, implicit $exec + S_CMP_LT_U32 killed $sgpr0, $sgpr12, implicit-def $scc + early-clobber $vgpr29_vgpr30_vgpr31_vgpr32 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr29_vgpr30_vgpr31_vgpr32, 0, 0, implicit $exec + early-clobber $vgpr25_vgpr26_vgpr27_vgpr28 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr25_vgpr26_vgpr27_vgpr28, 0, 0, implicit $exec + early-clobber $vgpr21_vgpr22_vgpr23_vgpr24 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr21_vgpr22_vgpr23_vgpr24, 0, 0, implicit $exec + early-clobber $vgpr17_vgpr18_vgpr19_vgpr20 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr17_vgpr18_vgpr19_vgpr20, 0, 0, implicit $exec + early-clobber $vgpr13_vgpr14_vgpr15_vgpr16 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr13_vgpr14_vgpr15_vgpr16, 0, 0, implicit $exec + early-clobber $vgpr9_vgpr10_vgpr11_vgpr12 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr9_vgpr10_vgpr11_vgpr12, 0, 0, implicit $exec + early-clobber $vgpr5_vgpr6_vgpr7_vgpr8 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88, 8, killed $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr5_vgpr6_vgpr7_vgpr8, 0, 0, implicit $exec + early-clobber $vgpr1_vgpr2_vgpr3_vgpr4 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88, 8, killed $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr1_vgpr2_vgpr3_vgpr4, 0, 0, implicit $exec + $sgpr0 = S_MOV_B32 killed $sgpr1 + ATOMIC_FENCE 5, 2 + S_BARRIER + ATOMIC_FENCE 4, 2 + +... diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll index f3cb5a7..30f5277 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll @@ -26,17 +26,17 @@ define amdgpu_kernel void @barrier_vmcnt_global(ptr addrspace(1) %arg) { ; GFX9-LABEL: barrier_vmcnt_global: ; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v1, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v1, 1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: global_load_dword v3, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_barrier -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: global_store_dword v[0:1], v3, off ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -369,10 +369,9 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) { ; GFX8-NEXT: flat_load_dword v3, v[2:3] ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_barrier ; GFX8-NEXT: flat_store_dword v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -393,10 +392,9 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) { ; GFX9-NEXT: flat_load_dword v3, v[2:3] ; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_barrier ; GFX9-NEXT: flat_store_dword v[0:1], v3 ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll index e3437fd..a42c8ac7 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll @@ -767,6 +767,136 @@ define amdgpu_gfx_whole_wave void @sgpr_spill_only(i1 %active, i32 %a, i32 %b) { ret void } +define amdgpu_gfx_whole_wave void @realign_stack(i1 %active, i32 %x) { +; DAGISEL-LABEL: realign_stack: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_mov_b32 s1, s33 +; DAGISEL-NEXT: s_add_co_i32 s33, s32, 0x3ff +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_and_b32 s33, s33, 0xfffffc00 +; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; DAGISEL-NEXT: s_mov_b32 s2, s34 +; DAGISEL-NEXT: s_mov_b32 s34, s32 +; DAGISEL-NEXT: s_addk_co_i32 s32, 0x800 +; DAGISEL-NEXT: s_wait_storecnt 0x0 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s33 scope:SCOPE_SYS +; DAGISEL-NEXT: s_wait_storecnt 0x0 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_mov_b32 s32, s34 +; DAGISEL-NEXT: s_mov_b32 s34, s2 +; DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL-NEXT: s_mov_b32 s33, s1 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: realign_stack: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_mov_b32 s1, s33 +; GISEL-NEXT: s_add_co_i32 s33, s32, 0x3ff +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_and_b32 s33, s33, 0xfffffc00 +; GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GISEL-NEXT: s_mov_b32 s2, s34 +; GISEL-NEXT: s_mov_b32 s34, s32 +; GISEL-NEXT: s_addk_co_i32 s32, 0x800 +; GISEL-NEXT: s_wait_storecnt 0x0 +; GISEL-NEXT: scratch_store_b32 off, v0, s33 scope:SCOPE_SYS +; GISEL-NEXT: s_wait_storecnt 0x0 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_mov_b32 s32, s34 +; GISEL-NEXT: s_mov_b32 s34, s2 +; GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GISEL-NEXT: s_mov_b32 s33, s1 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: realign_stack: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_mov_b32 s2, s33 +; DAGISEL64-NEXT: s_add_co_i32 s33, s32, 0x3ff +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_and_b32 s33, s33, 0xfffffc00 +; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; DAGISEL64-NEXT: s_mov_b32 s3, s34 +; DAGISEL64-NEXT: s_mov_b32 s34, s32 +; DAGISEL64-NEXT: s_addk_co_i32 s32, 0x800 +; DAGISEL64-NEXT: s_wait_storecnt 0x0 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s33 scope:SCOPE_SYS +; DAGISEL64-NEXT: s_wait_storecnt 0x0 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_mov_b32 s32, s34 +; DAGISEL64-NEXT: s_mov_b32 s34, s3 +; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL64-NEXT: s_mov_b32 s33, s2 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: realign_stack: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_mov_b32 s2, s33 +; GISEL64-NEXT: s_add_co_i32 s33, s32, 0x3ff +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_and_b32 s33, s33, 0xfffffc00 +; GISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GISEL64-NEXT: s_mov_b32 s3, s34 +; GISEL64-NEXT: s_mov_b32 s34, s32 +; GISEL64-NEXT: s_addk_co_i32 s32, 0x800 +; GISEL64-NEXT: s_wait_storecnt 0x0 +; GISEL64-NEXT: scratch_store_b32 off, v0, s33 scope:SCOPE_SYS +; GISEL64-NEXT: s_wait_storecnt 0x0 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_mov_b32 s32, s34 +; GISEL64-NEXT: s_mov_b32 s34, s3 +; GISEL64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL64-NEXT: s_mov_b32 s33, s2 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-DAGISEL-LABEL: realign_stack: +; GFX1250-DAGISEL: ; %bb.0: +; GFX1250-DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-DAGISEL-NEXT: s_mov_b32 s1, s33 +; GFX1250-DAGISEL-NEXT: s_add_co_i32 s33, s32, 0x3ff +; GFX1250-DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-DAGISEL-NEXT: s_and_b32 s33, s33, 0xfffffc00 +; GFX1250-DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-DAGISEL-NEXT: s_mov_b32 s2, s34 +; GFX1250-DAGISEL-NEXT: s_mov_b32 s34, s32 +; GFX1250-DAGISEL-NEXT: s_addk_co_i32 s32, 0x800 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0, s33 scope:SCOPE_SYS +; GFX1250-DAGISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-DAGISEL-NEXT: s_mov_b32 s32, s34 +; GFX1250-DAGISEL-NEXT: s_mov_b32 s34, s2 +; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-DAGISEL-NEXT: s_mov_b32 s33, s1 +; GFX1250-DAGISEL-NEXT: s_set_pc_i64 s[30:31] + %fussy = alloca i32, align 1024, addrspace(5) + store volatile i32 %x, ptr addrspace(5) %fussy, align 1024 + ret void +} + define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) { ; DAGISEL-LABEL: multiple_blocks: ; DAGISEL: ; %bb.0: |