aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll1477
1 files changed, 142 insertions, 1335 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index fdfbb42..216ea07 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -20,28 +20,10 @@ define float @flat_agent_atomic_fmin_ret_f32(ptr %ptr, float %val) #0 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB0_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32:
@@ -72,55 +54,21 @@ define float @flat_agent_atomic_fmin_ret_f32(ptr %ptr, float %val) #0 {
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB0_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB0_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32:
@@ -198,25 +146,9 @@ define float @flat_agent_atomic_fmin_ret_f32(ptr %ptr, float %val) #0 {
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v4
-; GFX7-NEXT: v_min_f32_e32 v3, v3, v2
-; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB0_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst
ret float %result
@@ -230,28 +162,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB1_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos:
@@ -282,56 +196,23 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB1_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT: v_min_f32_e32 v5, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB1_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos:
@@ -410,26 +291,11 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_min_f32_e32 v5, v0, v1
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB1_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst
@@ -444,28 +310,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB2_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg:
@@ -503,61 +351,25 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX11-NEXT: v_min_f32_e32 v5, v0, v1
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
+; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB2_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT: v_min_f32_e32 v5, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB2_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg:
@@ -642,26 +454,11 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_min_f32_e32 v5, v0, v1
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB2_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 -512
%result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst
@@ -676,27 +473,10 @@ define void @flat_agent_atomic_fmin_noret_f32(ptr %ptr, float %val) #0 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB3_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32:
@@ -726,53 +506,23 @@ define void @flat_agent_atomic_fmin_noret_f32(ptr %ptr, float %val) #0 {
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB3_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB3_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32:
@@ -847,24 +597,9 @@ define void @flat_agent_atomic_fmin_noret_f32(ptr %ptr, float %val) #0 {
; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmin v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB3_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst
ret void
@@ -878,27 +613,10 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB4_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos:
@@ -928,28 +646,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2 offset:2044
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB4_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos:
@@ -957,26 +659,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB4_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos:
@@ -1055,24 +743,9 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmin v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB4_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst
@@ -1087,27 +760,10 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:-2048
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB5_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg:
@@ -1143,32 +799,14 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v3, v[3:4]
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB5_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg:
@@ -1176,26 +814,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB5_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg:
@@ -1282,24 +906,9 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmin v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB5_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 -512
%unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst
@@ -1745,28 +1354,10 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB8_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__ftz:
@@ -1797,55 +1388,21 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB8_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB8_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__ftz:
@@ -1923,25 +1480,9 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__ftz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v4
-; GFX7-NEXT: v_min_f32_e32 v3, v3, v2
-; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB8_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst
ret float %result
@@ -1955,28 +1496,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr %ptr, float
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB9_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
@@ -2007,56 +1530,23 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr %ptr, float
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB9_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT: v_min_f32_e32 v5, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB9_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
@@ -2135,26 +1625,11 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr %ptr, float
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_min_f32_e32 v5, v0, v1
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB9_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst
@@ -2169,28 +1644,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB10_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
@@ -2228,61 +1685,25 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX11-NEXT: v_min_f32_e32 v5, v0, v1
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
+; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB10_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT: v_min_f32_e32 v5, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB10_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
@@ -2367,26 +1788,11 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_min_f32_e32 v5, v0, v1
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB10_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 -512
%result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst
@@ -2401,27 +1807,10 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB11_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__ftz:
@@ -2451,53 +1840,23 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB11_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB11_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__ftz:
@@ -2572,24 +1931,9 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32__ftz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmin v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB11_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst
ret void
@@ -2603,27 +1947,10 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB12_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
@@ -2653,28 +1980,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2 offset:2044
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB12_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
@@ -2682,26 +1993,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB12_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
@@ -2780,24 +2077,9 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmin v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB12_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst
@@ -2812,27 +2094,10 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:-2048
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB13_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
@@ -2868,32 +2133,14 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v3, v[3:4]
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB13_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
@@ -2901,26 +2148,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB13_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
@@ -3007,24 +2240,9 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmin v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB13_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 -512
%unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst
@@ -3497,27 +2715,10 @@ define double @flat_agent_atomic_fmin_ret_f64(ptr %ptr, double %val) #0 {
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0
+; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB16_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64:
@@ -3551,54 +2752,19 @@ define double @flat_agent_atomic_fmin_ret_f64(ptr %ptr, double %val) #0 {
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB16_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB16_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64:
@@ -3659,30 +2825,9 @@ define double @flat_agent_atomic_fmin_ret_f64(ptr %ptr, double %val) #0 {
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: flat_load_dword v5, v[5:6]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX7-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB16_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v4
-; GFX7-NEXT: v_mov_b32_e32 v1, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst
ret double %result
@@ -3723,27 +2868,10 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos(ptr %ptr, double %v
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 sc0
+; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] offset:2040 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB17_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos:
@@ -3777,54 +2905,21 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos(ptr %ptr, double %v
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
-; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v9, v1
-; GFX10-NEXT: v_mov_b32_e32 v8, v0
-; GFX10-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX10-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB17_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 glc
+; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] offset:2040 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos:
@@ -3885,30 +2980,11 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos(ptr %ptr, double %v
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7f8, v0
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7f8, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
-; GFX7-NEXT: flat_load_dword v0, v[4:5]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v9, v1
-; GFX7-NEXT: v_mov_b32_e32 v8, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX7-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB17_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr %ptr, i64 255
%result = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst
@@ -3950,33 +3026,13 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg(ptr %ptr, double %v
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v4, v0
-; GFX940-NEXT: v_mov_b32_e32 v5, v1
-; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
-; GFX940-NEXT: s_movk_i32 s0, 0xf800
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
-; GFX940-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX940-NEXT: s_mov_b32 s1, -1
-; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[0:1]
-; GFX940-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX940-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] sc0
+; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB18_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg:
@@ -4015,56 +3071,23 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg(ptr %ptr, double %v
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
-; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v9, v1
-; GFX10-NEXT: v_mov_b32_e32 v8, v0
-; GFX10-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX10-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB18_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX90A-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg:
@@ -4129,30 +3152,11 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg(ptr %ptr, double %v
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff804, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
-; GFX7-NEXT: flat_load_dword v0, v[4:5]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v9, v1
-; GFX7-NEXT: v_mov_b32_e32 v8, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX7-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB18_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr %ptr, i64 -256
%result = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst
@@ -4193,25 +3197,10 @@ define void @flat_agent_atomic_fmin_noret_f64(ptr %ptr, double %val) #0 {
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0
+; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB19_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64:
@@ -4244,50 +3233,20 @@ define void @flat_agent_atomic_fmin_noret_f64(ptr %ptr, double %val) #0 {
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB19_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB19_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64:
@@ -4344,28 +3303,9 @@ define void @flat_agent_atomic_fmin_noret_f64(ptr %ptr, double %val) #0 {
; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: flat_load_dword v5, v[5:6]
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB19_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst
ret void
@@ -4405,25 +3345,10 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos(ptr %ptr, double %v
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] offset:2040 sc0
+; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] offset:2040
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB20_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos:
@@ -4458,50 +3383,20 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos(ptr %ptr, double %v
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB20_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] offset:2040 glc
+; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] offset:2040
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB20_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos:
@@ -4560,30 +3455,11 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos(ptr %ptr, double %v
; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0x7f8, v0
-; GFX7-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7f8, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: flat_load_dword v4, v[6:7]
-; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_min_f64 v[2:3], v[2:3], v[0:1]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc
+; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB20_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr %ptr, i64 255
%unused = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst
@@ -4624,31 +3500,13 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg(ptr %ptr, double %v
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX940-NEXT: s_movk_i32 s0, 0xf800
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
-; GFX940-NEXT: s_mov_b32 s1, -1
-; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0
+; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB21_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg:
@@ -4687,54 +3545,22 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg(ptr %ptr, double %v
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB21_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[0:1]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc
+; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg:
@@ -4798,30 +3624,11 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg(ptr %ptr, double %v
; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff804, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: flat_load_dword v4, v[6:7]
-; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_min_f64 v[2:3], v[2:3], v[0:1]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc
+; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB21_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr %ptr, i64 -256
%unused = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst