aboutsummaryrefslogtreecommitdiff
path: root/llvm/test
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test')
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll1477
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll1477
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll1912
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll1912
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll605
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll605
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll638
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll271
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll638
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll271
-rw-r--r--llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll48
-rw-r--r--llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll48
-rw-r--r--llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll48
-rw-r--r--llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll48
-rw-r--r--llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll7
-rw-r--r--llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll7
16 files changed, 2762 insertions, 7250 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
index 00d418c..a05c05f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
@@ -20,28 +20,10 @@ define float @flat_agent_atomic_fmax_ret_f32(ptr %ptr, float %val) #0 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB0_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32:
@@ -72,55 +54,21 @@ define float @flat_agent_atomic_fmax_ret_f32(ptr %ptr, float %val) #0 {
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_max_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB0_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB0_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32:
@@ -198,25 +146,9 @@ define float @flat_agent_atomic_fmax_ret_f32(ptr %ptr, float %val) #0 {
; GFX7-LABEL: flat_agent_atomic_fmax_ret_f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v4
-; GFX7-NEXT: v_max_f32_e32 v3, v3, v2
-; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB0_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr %ptr, float %val syncscope("agent") seq_cst
ret float %result
@@ -230,28 +162,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB1_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos:
@@ -282,56 +196,23 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_max_f32 v0, v[0:1], v2 offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB1_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT: v_max_f32_e32 v5, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB1_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos:
@@ -410,26 +291,11 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX7-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_max_f32_e32 v5, v0, v1
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB1_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%result = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst
@@ -444,28 +310,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB2_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg:
@@ -503,61 +351,25 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX11-NEXT: v_max_f32_e32 v5, v0, v1
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
+; GFX11-NEXT: flat_atomic_max_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB2_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT: v_max_f32_e32 v5, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB2_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg:
@@ -642,26 +454,11 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX7-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_max_f32_e32 v5, v0, v1
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB2_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 -512
%result = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst
@@ -676,27 +473,10 @@ define void @flat_agent_atomic_fmax_noret_f32(ptr %ptr, float %val) #0 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB3_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32:
@@ -726,53 +506,23 @@ define void @flat_agent_atomic_fmax_noret_f32(ptr %ptr, float %val) #0 {
; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_max_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB3_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_noret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmax v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB3_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32:
@@ -847,24 +597,9 @@ define void @flat_agent_atomic_fmax_noret_f32(ptr %ptr, float %val) #0 {
; GFX7-LABEL: flat_agent_atomic_fmax_noret_f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmax v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB3_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmax ptr %ptr, float %val syncscope("agent") seq_cst
ret void
@@ -878,27 +613,10 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB4_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos:
@@ -928,28 +646,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_max_f32 v[0:1], v2 offset:2044
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB4_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos:
@@ -957,26 +659,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmax v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB4_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos:
@@ -1055,24 +743,9 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmax v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB4_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%unused = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst
@@ -1087,27 +760,10 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:-2048
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB5_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg:
@@ -1143,32 +799,14 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v3, v[3:4]
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_max_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB5_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg:
@@ -1176,26 +814,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmax v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB5_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg:
@@ -1282,24 +906,9 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmax v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB5_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 -512
%unused = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst
@@ -1745,28 +1354,10 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB8_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__ftz:
@@ -1797,55 +1388,21 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_max_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB8_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f32__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB8_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32__ftz:
@@ -1923,25 +1480,9 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX7-LABEL: flat_agent_atomic_fmax_ret_f32__ftz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v4
-; GFX7-NEXT: v_max_f32_e32 v3, v3, v2
-; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB8_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr %ptr, float %val syncscope("agent") seq_cst
ret float %result
@@ -1955,28 +1496,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr %ptr, float
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB9_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
@@ -2007,56 +1530,23 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr %ptr, float
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_max_f32 v0, v[0:1], v2 offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB9_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT: v_max_f32_e32 v5, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB9_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
@@ -2135,26 +1625,11 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr %ptr, float
; GFX7-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_max_f32_e32 v5, v0, v1
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB9_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%result = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst
@@ -2169,28 +1644,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB10_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
@@ -2228,61 +1685,25 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX11-NEXT: v_max_f32_e32 v5, v0, v1
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
+; GFX11-NEXT: flat_atomic_max_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB10_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT: v_max_f32_e32 v5, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB10_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
@@ -2367,26 +1788,11 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX7-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_max_f32_e32 v5, v0, v1
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB10_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 -512
%result = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst
@@ -2401,27 +1807,10 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB11_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__ftz:
@@ -2451,53 +1840,23 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_max_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB11_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_noret_f32__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmax v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB11_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__ftz:
@@ -2572,24 +1931,9 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX7-LABEL: flat_agent_atomic_fmax_noret_f32__ftz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmax v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB11_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmax ptr %ptr, float %val syncscope("agent") seq_cst
ret void
@@ -2603,27 +1947,10 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB12_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
@@ -2653,28 +1980,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_max_f32 v[0:1], v2 offset:2044
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB12_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
@@ -2682,26 +1993,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmax v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB12_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
@@ -2780,24 +2077,9 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmax v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB12_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%unused = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst
@@ -2812,27 +2094,10 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:-2048
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB13_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
@@ -2868,32 +2133,14 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v3, v[3:4]
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_max_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB13_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
@@ -2901,26 +2148,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmax v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB13_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
@@ -3007,24 +2240,9 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmax v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB13_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 -512
%unused = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst
@@ -3497,27 +2715,10 @@ define double @flat_agent_atomic_fmax_ret_f64(ptr %ptr, double %val) #0 {
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0
+; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB16_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64:
@@ -3551,54 +2752,19 @@ define double @flat_agent_atomic_fmax_ret_f64(ptr %ptr, double %val) #0 {
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB16_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB16_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64:
@@ -3659,30 +2825,9 @@ define double @flat_agent_atomic_fmax_ret_f64(ptr %ptr, double %val) #0 {
; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: flat_load_dword v5, v[5:6]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX7-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB16_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v4
-; GFX7-NEXT: v_mov_b32_e32 v1, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst
ret double %result
@@ -3723,27 +2868,10 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos(ptr %ptr, double %v
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 sc0
+; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] offset:2040 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB17_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos:
@@ -3777,54 +2905,21 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos(ptr %ptr, double %v
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
-; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v9, v1
-; GFX10-NEXT: v_mov_b32_e32 v8, v0
-; GFX10-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX10-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB17_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 glc
+; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] offset:2040 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos:
@@ -3885,30 +2980,11 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos(ptr %ptr, double %v
; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7f8, v0
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7f8, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
-; GFX7-NEXT: flat_load_dword v0, v[4:5]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v9, v1
-; GFX7-NEXT: v_mov_b32_e32 v8, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX7-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB17_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr %ptr, i64 255
%result = atomicrmw fmax ptr %gep, double %val syncscope("agent") seq_cst
@@ -3950,33 +3026,13 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg(ptr %ptr, double %v
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v4, v0
-; GFX940-NEXT: v_mov_b32_e32 v5, v1
-; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
-; GFX940-NEXT: s_movk_i32 s0, 0xf800
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
-; GFX940-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX940-NEXT: s_mov_b32 s1, -1
-; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[0:1]
-; GFX940-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX940-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] sc0
+; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB18_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg:
@@ -4015,56 +3071,23 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg(ptr %ptr, double %v
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
-; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v9, v1
-; GFX10-NEXT: v_mov_b32_e32 v8, v0
-; GFX10-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX10-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB18_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX90A-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg:
@@ -4129,30 +3152,11 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg(ptr %ptr, double %v
; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff804, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
-; GFX7-NEXT: flat_load_dword v0, v[4:5]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v9, v1
-; GFX7-NEXT: v_mov_b32_e32 v8, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX7-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB18_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr %ptr, i64 -256
%result = atomicrmw fmax ptr %gep, double %val syncscope("agent") seq_cst
@@ -4193,25 +3197,10 @@ define void @flat_agent_atomic_fmax_noret_f64(ptr %ptr, double %val) #0 {
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0
+; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB19_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64:
@@ -4244,50 +3233,20 @@ define void @flat_agent_atomic_fmax_noret_f64(ptr %ptr, double %val) #0 {
; GFX10-LABEL: flat_agent_atomic_fmax_noret_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB19_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB19_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64:
@@ -4344,28 +3303,9 @@ define void @flat_agent_atomic_fmax_noret_f64(ptr %ptr, double %val) #0 {
; GFX7-LABEL: flat_agent_atomic_fmax_noret_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: flat_load_dword v5, v[5:6]
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB19_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst
ret void
@@ -4405,25 +3345,10 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos(ptr %ptr, double %v
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] offset:2040 sc0
+; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] offset:2040
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB20_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos:
@@ -4458,50 +3383,20 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos(ptr %ptr, double %v
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB20_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] offset:2040 glc
+; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] offset:2040
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB20_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos:
@@ -4560,30 +3455,11 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos(ptr %ptr, double %v
; GFX7-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0x7f8, v0
-; GFX7-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7f8, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: flat_load_dword v4, v[6:7]
-; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[0:1]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc
+; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB20_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr %ptr, i64 255
%unused = atomicrmw fmax ptr %gep, double %val syncscope("agent") seq_cst
@@ -4624,31 +3500,13 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg(ptr %ptr, double %v
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX940-NEXT: s_movk_i32 s0, 0xf800
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
-; GFX940-NEXT: s_mov_b32 s1, -1
-; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0
+; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB21_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg:
@@ -4687,54 +3545,22 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg(ptr %ptr, double %v
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB21_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[0:1]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc
+; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg:
@@ -4798,30 +3624,11 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg(ptr %ptr, double %v
; GFX7-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff804, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: flat_load_dword v4, v[6:7]
-; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[0:1]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc
+; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB21_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr %ptr, i64 -256
%unused = atomicrmw fmax ptr %gep, double %val syncscope("agent") seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index fdfbb42..216ea07 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -20,28 +20,10 @@ define float @flat_agent_atomic_fmin_ret_f32(ptr %ptr, float %val) #0 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB0_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32:
@@ -72,55 +54,21 @@ define float @flat_agent_atomic_fmin_ret_f32(ptr %ptr, float %val) #0 {
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB0_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB0_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32:
@@ -198,25 +146,9 @@ define float @flat_agent_atomic_fmin_ret_f32(ptr %ptr, float %val) #0 {
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v4
-; GFX7-NEXT: v_min_f32_e32 v3, v3, v2
-; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB0_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst
ret float %result
@@ -230,28 +162,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB1_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos:
@@ -282,56 +196,23 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB1_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT: v_min_f32_e32 v5, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB1_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos:
@@ -410,26 +291,11 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_min_f32_e32 v5, v0, v1
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB1_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst
@@ -444,28 +310,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB2_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg:
@@ -503,61 +351,25 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX11-NEXT: v_min_f32_e32 v5, v0, v1
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
+; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB2_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT: v_min_f32_e32 v5, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB2_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg:
@@ -642,26 +454,11 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_min_f32_e32 v5, v0, v1
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB2_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 -512
%result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst
@@ -676,27 +473,10 @@ define void @flat_agent_atomic_fmin_noret_f32(ptr %ptr, float %val) #0 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB3_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32:
@@ -726,53 +506,23 @@ define void @flat_agent_atomic_fmin_noret_f32(ptr %ptr, float %val) #0 {
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB3_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB3_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32:
@@ -847,24 +597,9 @@ define void @flat_agent_atomic_fmin_noret_f32(ptr %ptr, float %val) #0 {
; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmin v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB3_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst
ret void
@@ -878,27 +613,10 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB4_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos:
@@ -928,28 +646,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2 offset:2044
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB4_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos:
@@ -957,26 +659,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB4_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos:
@@ -1055,24 +743,9 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmin v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB4_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst
@@ -1087,27 +760,10 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:-2048
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB5_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg:
@@ -1143,32 +799,14 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v3, v[3:4]
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB5_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg:
@@ -1176,26 +814,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB5_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg:
@@ -1282,24 +906,9 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmin v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB5_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 -512
%unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst
@@ -1745,28 +1354,10 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB8_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__ftz:
@@ -1797,55 +1388,21 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB8_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB8_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__ftz:
@@ -1923,25 +1480,9 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__ftz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v4
-; GFX7-NEXT: v_min_f32_e32 v3, v3, v2
-; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB8_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst
ret float %result
@@ -1955,28 +1496,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr %ptr, float
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB9_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
@@ -2007,56 +1530,23 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr %ptr, float
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB9_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT: v_min_f32_e32 v5, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB9_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
@@ -2135,26 +1625,11 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr %ptr, float
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_min_f32_e32 v5, v0, v1
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB9_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst
@@ -2169,28 +1644,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB10_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
@@ -2228,61 +1685,25 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX11-NEXT: v_min_f32_e32 v5, v0, v1
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
+; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB10_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT: v_min_f32_e32 v5, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB10_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
@@ -2367,26 +1788,11 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_min_f32_e32 v5, v0, v1
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB10_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 -512
%result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst
@@ -2401,27 +1807,10 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB11_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__ftz:
@@ -2451,53 +1840,23 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB11_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB11_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__ftz:
@@ -2572,24 +1931,9 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32__ftz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmin v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB11_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst
ret void
@@ -2603,27 +1947,10 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB12_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
@@ -2653,28 +1980,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2 offset:2044
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB12_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
@@ -2682,26 +1993,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB12_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
@@ -2780,24 +2077,9 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmin v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB12_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst
@@ -2812,27 +2094,10 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:-2048
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB13_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
@@ -2868,32 +2133,14 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v3, v[3:4]
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB13_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
@@ -2901,26 +2148,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB13_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
@@ -3007,24 +2240,9 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmin v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB13_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 -512
%unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst
@@ -3497,27 +2715,10 @@ define double @flat_agent_atomic_fmin_ret_f64(ptr %ptr, double %val) #0 {
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0
+; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB16_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64:
@@ -3551,54 +2752,19 @@ define double @flat_agent_atomic_fmin_ret_f64(ptr %ptr, double %val) #0 {
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB16_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB16_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64:
@@ -3659,30 +2825,9 @@ define double @flat_agent_atomic_fmin_ret_f64(ptr %ptr, double %val) #0 {
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: flat_load_dword v5, v[5:6]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX7-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB16_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v4
-; GFX7-NEXT: v_mov_b32_e32 v1, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst
ret double %result
@@ -3723,27 +2868,10 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos(ptr %ptr, double %v
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 sc0
+; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] offset:2040 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB17_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos:
@@ -3777,54 +2905,21 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos(ptr %ptr, double %v
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
-; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v9, v1
-; GFX10-NEXT: v_mov_b32_e32 v8, v0
-; GFX10-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX10-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB17_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 glc
+; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] offset:2040 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos:
@@ -3885,30 +2980,11 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos(ptr %ptr, double %v
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7f8, v0
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7f8, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
-; GFX7-NEXT: flat_load_dword v0, v[4:5]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v9, v1
-; GFX7-NEXT: v_mov_b32_e32 v8, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX7-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB17_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr %ptr, i64 255
%result = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst
@@ -3950,33 +3026,13 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg(ptr %ptr, double %v
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v4, v0
-; GFX940-NEXT: v_mov_b32_e32 v5, v1
-; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
-; GFX940-NEXT: s_movk_i32 s0, 0xf800
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
-; GFX940-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX940-NEXT: s_mov_b32 s1, -1
-; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[0:1]
-; GFX940-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX940-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] sc0
+; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB18_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg:
@@ -4015,56 +3071,23 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg(ptr %ptr, double %v
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
-; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v9, v1
-; GFX10-NEXT: v_mov_b32_e32 v8, v0
-; GFX10-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX10-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB18_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX90A-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg:
@@ -4129,30 +3152,11 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg(ptr %ptr, double %v
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff804, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
-; GFX7-NEXT: flat_load_dword v0, v[4:5]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v9, v1
-; GFX7-NEXT: v_mov_b32_e32 v8, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX7-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB18_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr %ptr, i64 -256
%result = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst
@@ -4193,25 +3197,10 @@ define void @flat_agent_atomic_fmin_noret_f64(ptr %ptr, double %val) #0 {
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0
+; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB19_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64:
@@ -4244,50 +3233,20 @@ define void @flat_agent_atomic_fmin_noret_f64(ptr %ptr, double %val) #0 {
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB19_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB19_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64:
@@ -4344,28 +3303,9 @@ define void @flat_agent_atomic_fmin_noret_f64(ptr %ptr, double %val) #0 {
; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: flat_load_dword v5, v[5:6]
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB19_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst
ret void
@@ -4405,25 +3345,10 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos(ptr %ptr, double %v
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] offset:2040 sc0
+; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] offset:2040
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB20_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos:
@@ -4458,50 +3383,20 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos(ptr %ptr, double %v
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB20_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] offset:2040 glc
+; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] offset:2040
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB20_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos:
@@ -4560,30 +3455,11 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos(ptr %ptr, double %v
; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0x7f8, v0
-; GFX7-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7f8, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: flat_load_dword v4, v[6:7]
-; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_min_f64 v[2:3], v[2:3], v[0:1]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc
+; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB20_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr %ptr, i64 255
%unused = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst
@@ -4624,31 +3500,13 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg(ptr %ptr, double %v
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX940-NEXT: s_movk_i32 s0, 0xf800
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
-; GFX940-NEXT: s_mov_b32 s1, -1
-; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0
+; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB21_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg:
@@ -4687,54 +3545,22 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg(ptr %ptr, double %v
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB21_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[0:1]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc
+; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg:
@@ -4798,30 +3624,11 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg(ptr %ptr, double %v
; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff804, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: flat_load_dword v4, v[6:7]
-; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_min_f64 v[2:3], v[2:3], v[0:1]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc
+; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB21_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr %ptr, i64 -256
%unused = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
index 492c74e..ae5dca4 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
@@ -21,28 +21,10 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB0_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32:
@@ -73,55 +55,21 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX11-LABEL: global_agent_atomic_fmax_ret_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB0_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_ret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB0_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32:
@@ -203,27 +151,10 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT: v_max_f32_e32 v4, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB0_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_f32:
@@ -233,28 +164,10 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT: v_max_f32_e32 v4, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB0_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
@@ -269,28 +182,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB1_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
@@ -321,55 +216,21 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB1_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB1_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
@@ -452,27 +313,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT: v_max_f32_e32 v4, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB1_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
@@ -482,28 +326,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT: v_max_f32_e32 v4, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB1_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
@@ -519,28 +345,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB2_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
@@ -571,55 +379,21 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB2_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB2_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
@@ -699,71 +473,26 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
-; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_max_f32_e32 v5, v0, v2
-; GFX7-NEXT: v_mov_b32_e32 v0, v5
-; GFX7-NEXT: v_mov_b32_e32 v1, v6
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB2_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: v_mov_b32_e32 v4, v1
-; GFX6-NEXT: v_mov_b32_e32 v3, v0
-; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3
-; GFX6-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX6-NEXT: v_max_f32_e32 v5, v0, v2
-; GFX6-NEXT: v_mov_b32_e32 v0, v5
-; GFX6-NEXT: v_mov_b32_e32 v1, v6
-; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB2_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
@@ -779,27 +508,10 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB3_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32:
@@ -829,53 +541,21 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX11-LABEL: global_agent_atomic_fmax_noret_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB3_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_noret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB3_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32:
@@ -954,26 +634,9 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB3_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_noret_f32:
@@ -983,27 +646,9 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB3_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
@@ -1018,27 +663,10 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB4_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
@@ -1068,53 +696,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB4_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:2044
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB4_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
@@ -1195,26 +791,9 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB4_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
@@ -1224,27 +803,9 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB4_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
@@ -1260,27 +821,10 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB5_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
@@ -1310,53 +854,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:-2048
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB5_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:-2048
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB5_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
@@ -1434,67 +946,24 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB5_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB5_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
@@ -2010,28 +1479,10 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB8_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
@@ -2062,55 +1513,21 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB8_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB8_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
@@ -2192,27 +1609,10 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT: v_max_f32_e32 v4, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB8_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
@@ -2222,28 +1622,10 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT: v_max_f32_e32 v4, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB8_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
@@ -2258,28 +1640,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB9_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
@@ -2310,55 +1674,21 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB9_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB9_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
@@ -2441,27 +1771,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT: v_max_f32_e32 v4, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB9_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
@@ -2471,28 +1784,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT: v_max_f32_e32 v4, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB9_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
@@ -2508,28 +1803,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB10_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
@@ -2560,55 +1837,21 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace
; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB10_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB10_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
@@ -2688,71 +1931,26 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
-; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_max_f32_e32 v5, v0, v2
-; GFX7-NEXT: v_mov_b32_e32 v0, v5
-; GFX7-NEXT: v_mov_b32_e32 v1, v6
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB10_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: v_mov_b32_e32 v4, v1
-; GFX6-NEXT: v_mov_b32_e32 v3, v0
-; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3
-; GFX6-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX6-NEXT: v_max_f32_e32 v5, v0, v2
-; GFX6-NEXT: v_mov_b32_e32 v0, v5
-; GFX6-NEXT: v_mov_b32_e32 v1, v6
-; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB10_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
@@ -2768,27 +1966,10 @@ define void @global_agent_atomic_fmax_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB11_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
@@ -2818,53 +1999,21 @@ define void @global_agent_atomic_fmax_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB11_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB11_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
@@ -2943,26 +2092,9 @@ define void @global_agent_atomic_fmax_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB11_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
@@ -2972,27 +2104,9 @@ define void @global_agent_atomic_fmax_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB11_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
@@ -3007,27 +2121,10 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB12_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
@@ -3057,53 +2154,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB12_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:2044
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB12_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
@@ -3184,26 +2249,9 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB12_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
@@ -3213,27 +2261,9 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB12_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
@@ -3249,27 +2279,10 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB13_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
@@ -3299,53 +2312,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:-2048
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB13_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:-2048
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB13_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
@@ -3423,67 +2404,24 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB13_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB13_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
@@ -4026,27 +2964,10 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v
; GFX940-LABEL: global_agent_atomic_fmax_ret_f64:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0
+; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB16_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmax_ret_f64:
@@ -4080,54 +3001,19 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v
; GFX10-LABEL: global_agent_atomic_fmax_ret_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB16_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB16_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_ret_f64:
@@ -4186,69 +3072,28 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v11, v1
-; GFX7-NEXT: v_mov_b32_e32 v10, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX7-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v0, v8
-; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
-; GFX7-NEXT: v_mov_b32_e32 v3, v11
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB16_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_f64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v1
-; GFX6-NEXT: v_mov_b32_e32 v10, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX6-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v0, v8
-; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
-; GFX6-NEXT: v_mov_b32_e32 v3, v11
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB16_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
+; GFX6-NEXT: v_mov_b32_e32 v1, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst
@@ -4290,27 +3135,10 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos(ptr addrspace(1)
; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 sc0
+; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB17_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
@@ -4344,54 +3172,19 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos(ptr addrspace(1)
; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB17_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
@@ -4450,69 +3243,28 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos(ptr addrspace(1)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:2040
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v11, v1
-; GFX7-NEXT: v_mov_b32_e32 v10, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX7-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v0, v8
-; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
-; GFX7-NEXT: v_mov_b32_e32 v3, v11
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:2040 glc
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB17_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:2040
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v1
-; GFX6-NEXT: v_mov_b32_e32 v10, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX6-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v0, v8
-; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
-; GFX6-NEXT: v_mov_b32_e32 v3, v11
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:2040 glc
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB17_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
+; GFX6-NEXT: v_mov_b32_e32 v1, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
@@ -4555,27 +3307,10 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg(ptr addrspace(1)
; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB18_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
@@ -4609,54 +3344,19 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg(ptr addrspace(1)
; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB18_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
+; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
@@ -4715,77 +3415,28 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg(ptr addrspace(1)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v4
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v5, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v11, v1
-; GFX7-NEXT: v_mov_b32_e32 v10, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX7-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v0, v8
-; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
-; GFX7-NEXT: v_mov_b32_e32 v3, v11
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB18_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v4
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v5, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v1
-; GFX6-NEXT: v_mov_b32_e32 v10, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX6-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v0, v8
-; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
-; GFX6-NEXT: v_mov_b32_e32 v3, v11
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB18_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
+; GFX6-NEXT: v_mov_b32_e32 v1, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
@@ -4827,25 +3478,10 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v
; GFX940-LABEL: global_agent_atomic_fmax_noret_f64:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0
+; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB19_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmax_noret_f64:
@@ -4878,50 +3514,19 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v
; GFX10-LABEL: global_agent_atomic_fmax_noret_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB19_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB19_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_noret_f64:
@@ -4979,29 +3584,9 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v11, v5
-; GFX7-NEXT: v_mov_b32_e32 v10, v4
-; GFX7-NEXT: v_mov_b32_e32 v9, v3
-; GFX7-NEXT: v_mov_b32_e32 v8, v2
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v8
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v5, v9
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB19_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_noret_f64:
@@ -5011,30 +3596,9 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX6-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v5
-; GFX6-NEXT: v_mov_b32_e32 v10, v4
-; GFX6-NEXT: v_mov_b32_e32 v9, v3
-; GFX6-NEXT: v_mov_b32_e32 v8, v2
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v4, v8
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v5, v9
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB19_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst
@@ -5075,25 +3639,10 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos(ptr addrspace(1)
; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 sc0
+; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:2040
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB20_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
@@ -5126,50 +3675,19 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos(ptr addrspace(1)
; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off offset:2040
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB20_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc
+; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:2040
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB20_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
@@ -5229,29 +3747,9 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:2040
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v11, v5
-; GFX7-NEXT: v_mov_b32_e32 v10, v4
-; GFX7-NEXT: v_mov_b32_e32 v9, v3
-; GFX7-NEXT: v_mov_b32_e32 v8, v2
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v8
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v5, v9
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB20_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
@@ -5261,30 +3759,9 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:2040
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX6-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v5
-; GFX6-NEXT: v_mov_b32_e32 v10, v4
-; GFX6-NEXT: v_mov_b32_e32 v9, v3
-; GFX6-NEXT: v_mov_b32_e32 v8, v2
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v4, v8
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v5, v9
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB20_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
@@ -5326,25 +3803,10 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg(ptr addrspace(1)
; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB21_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
@@ -5377,50 +3839,19 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg(ptr addrspace(1)
; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off offset:-2048
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB21_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
+; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
@@ -5477,73 +3908,24 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg(ptr addrspace(1)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v11, v5
-; GFX7-NEXT: v_mov_b32_e32 v10, v4
-; GFX7-NEXT: v_mov_b32_e32 v9, v3
-; GFX7-NEXT: v_mov_b32_e32 v8, v2
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v8
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v5, v9
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB21_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX6-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v5
-; GFX6-NEXT: v_mov_b32_e32 v10, v4
-; GFX6-NEXT: v_mov_b32_e32 v9, v3
-; GFX6-NEXT: v_mov_b32_e32 v8, v2
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v4, v8
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v5, v9
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB21_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
index a4adb08..915ce74 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
@@ -21,28 +21,10 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB0_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f32:
@@ -73,55 +55,21 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX11-LABEL: global_agent_atomic_fmin_ret_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB0_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_ret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB0_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32:
@@ -203,27 +151,10 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT: v_min_f32_e32 v4, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB0_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_f32:
@@ -233,28 +164,10 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT: v_min_f32_e32 v4, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB0_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
@@ -269,28 +182,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB1_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
@@ -321,55 +216,21 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB1_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB1_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
@@ -452,27 +313,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT: v_min_f32_e32 v4, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB1_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
@@ -482,28 +326,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT: v_min_f32_e32 v4, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB1_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
@@ -519,28 +345,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB2_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
@@ -571,55 +379,21 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB2_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB2_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
@@ -699,71 +473,26 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
-; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_min_f32_e32 v5, v0, v2
-; GFX7-NEXT: v_mov_b32_e32 v0, v5
-; GFX7-NEXT: v_mov_b32_e32 v1, v6
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB2_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: v_mov_b32_e32 v4, v1
-; GFX6-NEXT: v_mov_b32_e32 v3, v0
-; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3
-; GFX6-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX6-NEXT: v_min_f32_e32 v5, v0, v2
-; GFX6-NEXT: v_mov_b32_e32 v0, v5
-; GFX6-NEXT: v_mov_b32_e32 v1, v6
-; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB2_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
@@ -779,27 +508,10 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB3_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32:
@@ -829,53 +541,21 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX11-LABEL: global_agent_atomic_fmin_noret_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB3_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_noret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB3_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32:
@@ -954,26 +634,9 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB3_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_noret_f32:
@@ -983,27 +646,9 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB3_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
@@ -1018,27 +663,10 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB4_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
@@ -1068,53 +696,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB4_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:2044
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB4_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
@@ -1195,26 +791,9 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB4_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
@@ -1224,27 +803,9 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB4_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
@@ -1260,27 +821,10 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB5_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
@@ -1310,53 +854,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:-2048
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB5_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:-2048
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB5_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
@@ -1434,67 +946,24 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB5_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB5_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
@@ -2010,28 +1479,10 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB8_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
@@ -2062,55 +1513,21 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB8_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB8_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
@@ -2192,27 +1609,10 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT: v_min_f32_e32 v4, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB8_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
@@ -2222,28 +1622,10 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT: v_min_f32_e32 v4, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB8_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
@@ -2258,28 +1640,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB9_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
@@ -2310,55 +1674,21 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB9_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB9_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
@@ -2441,27 +1771,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT: v_min_f32_e32 v4, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB9_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
@@ -2471,28 +1784,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT: v_min_f32_e32 v4, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB9_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
@@ -2508,28 +1803,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB10_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
@@ -2560,55 +1837,21 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace
; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB10_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB10_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
@@ -2688,71 +1931,26 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
-; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_min_f32_e32 v5, v0, v2
-; GFX7-NEXT: v_mov_b32_e32 v0, v5
-; GFX7-NEXT: v_mov_b32_e32 v1, v6
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB10_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: v_mov_b32_e32 v4, v1
-; GFX6-NEXT: v_mov_b32_e32 v3, v0
-; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3
-; GFX6-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX6-NEXT: v_min_f32_e32 v5, v0, v2
-; GFX6-NEXT: v_mov_b32_e32 v0, v5
-; GFX6-NEXT: v_mov_b32_e32 v1, v6
-; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB10_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
@@ -2768,27 +1966,10 @@ define void @global_agent_atomic_fmin_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB11_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
@@ -2818,53 +1999,21 @@ define void @global_agent_atomic_fmin_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB11_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB11_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
@@ -2943,26 +2092,9 @@ define void @global_agent_atomic_fmin_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB11_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
@@ -2972,27 +2104,9 @@ define void @global_agent_atomic_fmin_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB11_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
@@ -3007,27 +2121,10 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB12_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
@@ -3057,53 +2154,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB12_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:2044
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB12_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
@@ -3184,26 +2249,9 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB12_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
@@ -3213,27 +2261,9 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB12_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
@@ -3249,27 +2279,10 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB13_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
@@ -3299,53 +2312,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:-2048
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB13_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:-2048
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB13_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
@@ -3423,67 +2404,24 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB13_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB13_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
@@ -4026,27 +2964,10 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v
; GFX940-LABEL: global_agent_atomic_fmin_ret_f64:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0
+; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB16_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmin_ret_f64:
@@ -4080,54 +3001,19 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v
; GFX10-LABEL: global_agent_atomic_fmin_ret_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB16_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB16_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_ret_f64:
@@ -4186,69 +3072,28 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v11, v1
-; GFX7-NEXT: v_mov_b32_e32 v10, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX7-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v0, v8
-; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
-; GFX7-NEXT: v_mov_b32_e32 v3, v11
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB16_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_f64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v1
-; GFX6-NEXT: v_mov_b32_e32 v10, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX6-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v0, v8
-; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
-; GFX6-NEXT: v_mov_b32_e32 v3, v11
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB16_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
+; GFX6-NEXT: v_mov_b32_e32 v1, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst
@@ -4290,27 +3135,10 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos(ptr addrspace(1)
; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 sc0
+; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB17_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
@@ -4344,54 +3172,19 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos(ptr addrspace(1)
; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB17_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
@@ -4450,69 +3243,28 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos(ptr addrspace(1)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:2040
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v11, v1
-; GFX7-NEXT: v_mov_b32_e32 v10, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX7-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v0, v8
-; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
-; GFX7-NEXT: v_mov_b32_e32 v3, v11
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:2040 glc
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB17_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:2040
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v1
-; GFX6-NEXT: v_mov_b32_e32 v10, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX6-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v0, v8
-; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
-; GFX6-NEXT: v_mov_b32_e32 v3, v11
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:2040 glc
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB17_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
+; GFX6-NEXT: v_mov_b32_e32 v1, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
@@ -4555,27 +3307,10 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg(ptr addrspace(1)
; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB18_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
@@ -4609,54 +3344,19 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg(ptr addrspace(1)
; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB18_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
+; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
@@ -4715,77 +3415,28 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg(ptr addrspace(1)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v4
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v5, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v11, v1
-; GFX7-NEXT: v_mov_b32_e32 v10, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX7-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v0, v8
-; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
-; GFX7-NEXT: v_mov_b32_e32 v3, v11
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB18_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v4
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v5, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v1
-; GFX6-NEXT: v_mov_b32_e32 v10, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX6-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v0, v8
-; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
-; GFX6-NEXT: v_mov_b32_e32 v3, v11
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB18_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
+; GFX6-NEXT: v_mov_b32_e32 v1, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
@@ -4827,25 +3478,10 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v
; GFX940-LABEL: global_agent_atomic_fmin_noret_f64:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0
+; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB19_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmin_noret_f64:
@@ -4878,50 +3514,19 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v
; GFX10-LABEL: global_agent_atomic_fmin_noret_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB19_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB19_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_noret_f64:
@@ -4979,29 +3584,9 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v11, v5
-; GFX7-NEXT: v_mov_b32_e32 v10, v4
-; GFX7-NEXT: v_mov_b32_e32 v9, v3
-; GFX7-NEXT: v_mov_b32_e32 v8, v2
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v8
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v5, v9
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB19_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_noret_f64:
@@ -5011,30 +3596,9 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v5
-; GFX6-NEXT: v_mov_b32_e32 v10, v4
-; GFX6-NEXT: v_mov_b32_e32 v9, v3
-; GFX6-NEXT: v_mov_b32_e32 v8, v2
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v4, v8
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v5, v9
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB19_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst
@@ -5075,25 +3639,10 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos(ptr addrspace(1)
; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 sc0
+; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:2040
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB20_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
@@ -5126,50 +3675,19 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos(ptr addrspace(1)
; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off offset:2040
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB20_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc
+; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:2040
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB20_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
@@ -5229,29 +3747,9 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:2040
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v11, v5
-; GFX7-NEXT: v_mov_b32_e32 v10, v4
-; GFX7-NEXT: v_mov_b32_e32 v9, v3
-; GFX7-NEXT: v_mov_b32_e32 v8, v2
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v8
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v5, v9
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB20_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
@@ -5261,30 +3759,9 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:2040
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v5
-; GFX6-NEXT: v_mov_b32_e32 v10, v4
-; GFX6-NEXT: v_mov_b32_e32 v9, v3
-; GFX6-NEXT: v_mov_b32_e32 v8, v2
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v4, v8
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v5, v9
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB20_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
@@ -5326,25 +3803,10 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg(ptr addrspace(1)
; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB21_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
@@ -5377,50 +3839,19 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg(ptr addrspace(1)
; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off offset:-2048
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB21_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
+; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
@@ -5477,73 +3908,24 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg(ptr addrspace(1)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v11, v5
-; GFX7-NEXT: v_mov_b32_e32 v10, v4
-; GFX7-NEXT: v_mov_b32_e32 v9, v3
-; GFX7-NEXT: v_mov_b32_e32 v8, v2
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v8
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v5, v9
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB21_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v5
-; GFX6-NEXT: v_mov_b32_e32 v10, v4
-; GFX6-NEXT: v_mov_b32_e32 v9, v3
-; GFX6-NEXT: v_mov_b32_e32 v8, v2
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v4, v8
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v5, v9
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB21_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 6548792..7f052e1 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -84,55 +84,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB0_3
+; GFX1064-NEXT: s_cbranch_execz .LBB0_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1064-NEXT: .LBB0_3:
+; GFX1064-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-NEXT: .LBB0_2:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB0_3
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB0_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
-; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1032-NEXT: .LBB0_3:
+; GFX1032-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-NEXT: .LBB0_2:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
@@ -142,60 +116,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_cbranch_execz .LBB0_3
+; GFX1164-NEXT: s_cbranch_execz .LBB0_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1164-NEXT: .LBB0_3:
+; GFX1164-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-NEXT: .LBB0_2:
+; GFX1164-NEXT: s_nop 0
+; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_cbranch_execz .LBB0_3
+; GFX1132-NEXT: s_cbranch_execz .LBB0_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
-; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1132-NEXT: .LBB0_3:
+; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-NEXT: .LBB0_2:
+; GFX1132-NEXT: s_nop 0
+; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
@@ -233,55 +180,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1064-DPP-NEXT: .LBB0_3:
+; GFX1064-DPP-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-DPP-NEXT: .LBB0_2:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
-; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1032-DPP-NEXT: .LBB0_3:
+; GFX1032-DPP-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-DPP-NEXT: .LBB0_2:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
@@ -291,60 +212,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1164-DPP-NEXT: .LBB0_3:
+; GFX1164-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT: .LBB0_2:
+; GFX1164-DPP-NEXT: s_nop 0
+; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_2
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
-; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1132-DPP-NEXT: .LBB0_3:
+; GFX1132-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT: .LBB0_2:
+; GFX1132-DPP-NEXT: s_nop 0
+; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-DPP-NEXT: s_endpgm
%result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4
ret void
@@ -501,18 +395,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1064-NEXT: s_mov_b32 s32, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0xff800000
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0xff800000
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -520,27 +414,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execz .LBB1_5
+; GFX1064-NEXT: s_cbranch_execz .LBB1_4
; GFX1064-NEXT: ; %bb.3:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
-; GFX1064-NEXT: .LBB1_4: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB1_4
-; GFX1064-NEXT: .LBB1_5:
+; GFX1064-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-NEXT: .LBB1_4:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
@@ -571,45 +451,31 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1032-NEXT: s_mov_b32 s32, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0xff800000
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0xff800000
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
-; GFX1032-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1032-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execz .LBB1_5
+; GFX1032-NEXT: s_cbranch_execz .LBB1_4
; GFX1032-NEXT: ; %bb.3:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
-; GFX1032-NEXT: .LBB1_4: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB1_4
-; GFX1032-NEXT: .LBB1_5:
+; GFX1032-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-NEXT: .LBB1_4:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
@@ -630,13 +496,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_mov_b32 s32, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0xff800000
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0xff800000
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
@@ -644,7 +510,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -653,29 +519,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execz .LBB1_5
+; GFX1164-NEXT: s_cbranch_execz .LBB1_4
; GFX1164-NEXT: ; %bb.3:
; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1]
-; GFX1164-NEXT: .LBB1_4: ; %atomicrmw.start
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB1_4
-; GFX1164-NEXT: .LBB1_5:
+; GFX1164-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-NEXT: .LBB1_4:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
@@ -696,13 +546,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0xff800000
+; GFX1132-NEXT: v_mov_b32_e32 v1, 0xff800000
; GFX1132-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
-; GFX1132-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -710,36 +560,21 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execz .LBB1_5
+; GFX1132-NEXT: s_cbranch_execz .LBB1_4
; GFX1132-NEXT: ; %bb.3:
; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2
+; GFX1132-NEXT: v_mov_b32_e32 v0, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1]
-; GFX1132-NEXT: .LBB1_4: ; %atomicrmw.start
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB1_4
-; GFX1132-NEXT: .LBB1_5:
+; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-NEXT: .LBB1_4:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
@@ -904,27 +739,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT: v_max_f32_e32 v6, v0, v0
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: global_load_dword v1, v2, s[0:1]
-; GFX1064-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v0, v6
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2
-; GFX1064-DPP-NEXT: .LBB1_3:
+; GFX1064-DPP-NEXT: global_atomic_fmax v1, v0, s[0:1]
+; GFX1064-DPP-NEXT: .LBB1_2:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
@@ -986,29 +807,15 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT: v_max_f32_e32 v6, v0, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: global_load_dword v1, v2, s[0:1]
-; GFX1032-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v0, v6
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2
-; GFX1032-DPP-NEXT: .LBB1_3:
+; GFX1032-DPP-NEXT: global_atomic_fmax v1, v0, s[0:1]
+; GFX1032-DPP-NEXT: .LBB1_2:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
@@ -1076,34 +883,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_3
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT: v_max_f32_e32 v6, v4, v4
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: global_load_b32 v5, v0, s[0:1]
-; GFX1164-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v4, v6
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_2
-; GFX1164-DPP-NEXT: .LBB1_3:
+; GFX1164-DPP-NEXT: global_atomic_max_f32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT: .LBB1_2:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
@@ -1159,34 +950,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_3
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT: v_max_f32_e32 v6, v4, v4
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: global_load_b32 v5, v0, s[0:1]
-; GFX1132-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v4, v6
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_2
-; GFX1132-DPP-NEXT: .LBB1_3:
+; GFX1132-DPP-NEXT: global_atomic_max_f32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT: .LBB1_2:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
%result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4
@@ -3626,59 +3401,31 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB6_3
+; GFX1064-NEXT: s_cbranch_execz .LBB6_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB6_2: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB6_2
-; GFX1064-NEXT: .LBB6_3:
+; GFX1064-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: .LBB6_2:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB6_3
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB6_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
-; GFX1032-NEXT: .LBB6_2: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB6_2
-; GFX1032-NEXT: .LBB6_3:
+; GFX1032-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: .LBB6_2:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
@@ -3783,59 +3530,31 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2
-; GFX1064-DPP-NEXT: .LBB6_3:
+; GFX1064-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT: .LBB6_2:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
-; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2
-; GFX1032-DPP-NEXT: .LBB6_3:
+; GFX1032-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT: .LBB6_2:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
@@ -4039,23 +3758,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
@@ -4087,23 +3790,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
@@ -4261,23 +3948,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
@@ -4309,23 +3980,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 6936cdc..a9f49ad 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -84,55 +84,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB0_3
+; GFX1064-NEXT: s_cbranch_execz .LBB0_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1064-NEXT: .LBB0_3:
+; GFX1064-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-NEXT: .LBB0_2:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB0_3
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB0_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
-; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1032-NEXT: .LBB0_3:
+; GFX1032-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-NEXT: .LBB0_2:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
@@ -142,60 +116,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_cbranch_execz .LBB0_3
+; GFX1164-NEXT: s_cbranch_execz .LBB0_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1164-NEXT: .LBB0_3:
+; GFX1164-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-NEXT: .LBB0_2:
+; GFX1164-NEXT: s_nop 0
+; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_cbranch_execz .LBB0_3
+; GFX1132-NEXT: s_cbranch_execz .LBB0_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
-; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1132-NEXT: .LBB0_3:
+; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-NEXT: .LBB0_2:
+; GFX1132-NEXT: s_nop 0
+; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
@@ -233,55 +180,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1064-DPP-NEXT: .LBB0_3:
+; GFX1064-DPP-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-DPP-NEXT: .LBB0_2:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
-; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1032-DPP-NEXT: .LBB0_3:
+; GFX1032-DPP-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-DPP-NEXT: .LBB0_2:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
@@ -291,60 +212,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1164-DPP-NEXT: .LBB0_3:
+; GFX1164-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT: .LBB0_2:
+; GFX1164-DPP-NEXT: s_nop 0
+; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_2
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
-; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1132-DPP-NEXT: .LBB0_3:
+; GFX1132-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT: .LBB0_2:
+; GFX1132-DPP-NEXT: s_nop 0
+; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-DPP-NEXT: s_endpgm
%result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4
ret void
@@ -501,18 +395,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1064-NEXT: s_mov_b32 s32, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0x7f800000
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -520,27 +414,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execz .LBB1_5
+; GFX1064-NEXT: s_cbranch_execz .LBB1_4
; GFX1064-NEXT: ; %bb.3:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
-; GFX1064-NEXT: .LBB1_4: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB1_4
-; GFX1064-NEXT: .LBB1_5:
+; GFX1064-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-NEXT: .LBB1_4:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
@@ -571,45 +451,31 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1032-NEXT: s_mov_b32 s32, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0x7f800000
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
-; GFX1032-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1032-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execz .LBB1_5
+; GFX1032-NEXT: s_cbranch_execz .LBB1_4
; GFX1032-NEXT: ; %bb.3:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
-; GFX1032-NEXT: .LBB1_4: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB1_4
-; GFX1032-NEXT: .LBB1_5:
+; GFX1032-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-NEXT: .LBB1_4:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
@@ -630,13 +496,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_mov_b32 s32, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7f800000
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
@@ -644,7 +510,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -653,29 +519,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execz .LBB1_5
+; GFX1164-NEXT: s_cbranch_execz .LBB1_4
; GFX1164-NEXT: ; %bb.3:
; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1]
-; GFX1164-NEXT: .LBB1_4: ; %atomicrmw.start
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB1_4
-; GFX1164-NEXT: .LBB1_5:
+; GFX1164-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-NEXT: .LBB1_4:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
@@ -696,13 +546,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7f800000
; GFX1132-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
-; GFX1132-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -710,36 +560,21 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execz .LBB1_5
+; GFX1132-NEXT: s_cbranch_execz .LBB1_4
; GFX1132-NEXT: ; %bb.3:
; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2
+; GFX1132-NEXT: v_mov_b32_e32 v0, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1]
-; GFX1132-NEXT: .LBB1_4: ; %atomicrmw.start
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB1_4
-; GFX1132-NEXT: .LBB1_5:
+; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-NEXT: .LBB1_4:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
@@ -904,27 +739,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT: v_max_f32_e32 v6, v0, v0
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: global_load_dword v1, v2, s[0:1]
-; GFX1064-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT: v_min_f32_e32 v0, v0, v6
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2
-; GFX1064-DPP-NEXT: .LBB1_3:
+; GFX1064-DPP-NEXT: global_atomic_fmin v1, v0, s[0:1]
+; GFX1064-DPP-NEXT: .LBB1_2:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
@@ -986,29 +807,15 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT: v_max_f32_e32 v6, v0, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: global_load_dword v1, v2, s[0:1]
-; GFX1032-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT: v_min_f32_e32 v0, v0, v6
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2
-; GFX1032-DPP-NEXT: .LBB1_3:
+; GFX1032-DPP-NEXT: global_atomic_fmin v1, v0, s[0:1]
+; GFX1032-DPP-NEXT: .LBB1_2:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
@@ -1076,34 +883,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_3
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT: v_max_f32_e32 v6, v4, v4
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: global_load_b32 v5, v0, s[0:1]
-; GFX1164-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_min_f32_e32 v4, v4, v6
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_2
-; GFX1164-DPP-NEXT: .LBB1_3:
+; GFX1164-DPP-NEXT: global_atomic_min_f32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT: .LBB1_2:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
@@ -1159,34 +950,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_3
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT: v_max_f32_e32 v6, v4, v4
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: global_load_b32 v5, v0, s[0:1]
-; GFX1132-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_min_f32_e32 v4, v4, v6
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_2
-; GFX1132-DPP-NEXT: .LBB1_3:
+; GFX1132-DPP-NEXT: global_atomic_min_f32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT: .LBB1_2:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
%result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4
@@ -3626,59 +3401,31 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB6_3
+; GFX1064-NEXT: s_cbranch_execz .LBB6_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB6_2: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB6_2
-; GFX1064-NEXT: .LBB6_3:
+; GFX1064-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: .LBB6_2:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB6_3
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB6_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
-; GFX1032-NEXT: .LBB6_2: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB6_2
-; GFX1032-NEXT: .LBB6_3:
+; GFX1032-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: .LBB6_2:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
@@ -3783,59 +3530,31 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2
-; GFX1064-DPP-NEXT: .LBB6_3:
+; GFX1064-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT: .LBB6_2:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
-; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2
-; GFX1032-DPP-NEXT: .LBB6_3:
+; GFX1032-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT: .LBB6_2:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
@@ -4039,23 +3758,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
@@ -4087,23 +3790,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
@@ -4261,23 +3948,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
@@ -4309,23 +3980,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
new file mode 100644
index 0000000..1fb5d53
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
@@ -0,0 +1,638 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
+; Not supported in gfx8 or gfx9
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+
+define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ ret float %ret
+}
+
+define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret float %ret
+}
+
+define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], s4 idxen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
+ ret float %ret
+}
+
+define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc slc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc slc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc slc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen glc slc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
+ ret float %ret
+}
+
+define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ ret void
+}
+
+define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret void
+}
+
+; Natural mapping, no voffset
+define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], s4 idxen
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s4 idxen
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
+ ret void
+}
+
+define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen slc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen slc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen slc
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen slc
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
+ ret void
+}
+
+; Test waterfall loop on resource
+define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_add__sgpr_soffset(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[12:13], exec
+; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v1
+; GFX6-NEXT: v_readfirstlane_b32 s9, v2
+; GFX6-NEXT: v_readfirstlane_b32 s10, v3
+; GFX6-NEXT: v_readfirstlane_b32 s11, v4
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[3:4]
+; GFX6-NEXT: s_and_b64 s[6:7], vcc, s[6:7]
+; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[6:7]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc
+; GFX6-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX6-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX6-NEXT: s_xor_b64 exec, exec, s[6:7]
+; GFX6-NEXT: s_cbranch_execnz .LBB8_1
+; GFX6-NEXT: ; %bb.2:
+; GFX6-NEXT: s_mov_b64 exec, s[12:13]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[12:13], exec
+; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v1
+; GFX7-NEXT: v_readfirstlane_b32 s9, v2
+; GFX7-NEXT: v_readfirstlane_b32 s10, v3
+; GFX7-NEXT: v_readfirstlane_b32 s11, v4
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[3:4]
+; GFX7-NEXT: s_and_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[6:7]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc
+; GFX7-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX7-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX7-NEXT: s_xor_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB8_1
+; GFX7-NEXT: ; %bb.2:
+; GFX7-NEXT: s_mov_b64 exec, s[12:13]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_readfirstlane_b32 s8, v1
+; GFX10-NEXT: v_readfirstlane_b32 s9, v2
+; GFX10-NEXT: v_readfirstlane_b32 s10, v3
+; GFX10-NEXT: v_readfirstlane_b32 s11, v4
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[1:2]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[3:4]
+; GFX10-NEXT: s_and_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_and_saveexec_b32 s5, s5
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc
+; GFX10-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX10-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_cbranch_execnz .LBB8_1
+; GFX10-NEXT: ; %bb.2:
+; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-NEXT: v_readfirstlane_b32 s5, v2
+; GFX11-NEXT: v_readfirstlane_b32 s6, v3
+; GFX11-NEXT: v_readfirstlane_b32 s7, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX11-NEXT: s_and_saveexec_b32 s1, s1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_atomic_max_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 glc
+; GFX11-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: v_readfirstlane_b32 s4, v1
+; GFX12-NEXT: v_readfirstlane_b32 s5, v2
+; GFX12-NEXT: v_readfirstlane_b32 s6, v3
+; GFX12-NEXT: v_readfirstlane_b32 s7, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_and_saveexec_b32 s1, s1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-NEXT: ; %bb.2:
+; GFX12-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret float %ret
+}
+
+; Test waterfall loop on soffset
+define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__vgpr_soffset(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__vgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[6:7], exec
+; GFX6-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v1
+; GFX6-NEXT: v_readfirstlane_b32 s9, v2
+; GFX6-NEXT: v_readfirstlane_b32 s10, v3
+; GFX6-NEXT: v_readfirstlane_b32 s11, v4
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[3:4]
+; GFX6-NEXT: v_readfirstlane_b32 s12, v7
+; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s12, v7
+; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], vcc
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s12 idxen offen offset:256 glc
+; GFX6-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX6-NEXT: ; implicit-def: $vgpr7
+; GFX6-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB9_1
+; GFX6-NEXT: ; %bb.2:
+; GFX6-NEXT: s_mov_b64 exec, s[6:7]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__vgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[6:7], exec
+; GFX7-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v1
+; GFX7-NEXT: v_readfirstlane_b32 s9, v2
+; GFX7-NEXT: v_readfirstlane_b32 s10, v3
+; GFX7-NEXT: v_readfirstlane_b32 s11, v4
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[3:4]
+; GFX7-NEXT: v_readfirstlane_b32 s12, v7
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, s12, v7
+; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], vcc
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s12 idxen offen offset:256 glc
+; GFX7-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX7-NEXT: ; implicit-def: $vgpr7
+; GFX7-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB9_1
+; GFX7-NEXT: ; %bb.2:
+; GFX7-NEXT: s_mov_b64 exec, s[6:7]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__vgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_readfirstlane_b32 s8, v1
+; GFX10-NEXT: v_readfirstlane_b32 s9, v2
+; GFX10-NEXT: v_readfirstlane_b32 s10, v3
+; GFX10-NEXT: v_readfirstlane_b32 s11, v4
+; GFX10-NEXT: v_readfirstlane_b32 s7, v7
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[1:2]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[3:4]
+; GFX10-NEXT: v_cmp_eq_u32_e64 s5, s7, v7
+; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_and_b32 s4, s4, s5
+; GFX10-NEXT: s_and_saveexec_b32 s4, s4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s7 idxen offen offset:256 glc
+; GFX10-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX10-NEXT: ; implicit-def: $vgpr7
+; GFX10-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB9_1
+; GFX10-NEXT: ; %bb.2:
+; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__vgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-NEXT: v_readfirstlane_b32 s5, v2
+; GFX11-NEXT: v_readfirstlane_b32 s6, v3
+; GFX11-NEXT: v_readfirstlane_b32 s7, v4
+; GFX11-NEXT: v_readfirstlane_b32 s3, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_atomic_max_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 glc
+; GFX11-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX11-NEXT: ; implicit-def: $vgpr7
+; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB9_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__vgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: v_readfirstlane_b32 s4, v1
+; GFX12-NEXT: v_readfirstlane_b32 s5, v2
+; GFX12-NEXT: v_readfirstlane_b32 s6, v3
+; GFX12-NEXT: v_readfirstlane_b32 s7, v4
+; GFX12-NEXT: v_readfirstlane_b32 s3, v7
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_b32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX12-NEXT: ; implicit-def: $vgpr7
+; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB9_1
+; GFX12-NEXT: ; %bb.2:
+; GFX12-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret float %ret
+}
+
+declare float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll
new file mode 100644
index 0000000..b859147
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll
@@ -0,0 +1,271 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
+; Not supported in gfx8 or gfx9, except 90a/940
+; xUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90A %s
+; xUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX940 %s
+
+define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ ret double %ret
+}
+
+define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret double %ret
+}
+
+define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], s8 idxen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], s8 idxen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
+ ret double %ret
+}
+
+define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc slc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc slc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
+ ret double %ret
+}
+
+define void @struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ ret void
+}
+
+define void @struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret void
+}
+
+; Natural mapping, no voffset
+define void @struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], s8 idxen
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], s8 idxen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
+ ret void
+}
+
+define void @struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen slc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen slc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
+ ret void
+}
+
+; Test waterfall loop on resource
+define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[12:13], exec
+; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v2
+; GFX6-NEXT: v_readfirstlane_b32 s9, v3
+; GFX6-NEXT: v_readfirstlane_b32 s10, v4
+; GFX6-NEXT: v_readfirstlane_b32 s11, v5
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[4:5]
+; GFX6-NEXT: s_and_b64 s[6:7], vcc, s[6:7]
+; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[6:7]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s4 idxen offen offset:256 glc
+; GFX6-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX6-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX6-NEXT: s_xor_b64 exec, exec, s[6:7]
+; GFX6-NEXT: s_cbranch_execnz .LBB8_1
+; GFX6-NEXT: ; %bb.2:
+; GFX6-NEXT: s_mov_b64 exec, s[12:13]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[12:13], exec
+; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v2
+; GFX7-NEXT: v_readfirstlane_b32 s9, v3
+; GFX7-NEXT: v_readfirstlane_b32 s10, v4
+; GFX7-NEXT: v_readfirstlane_b32 s11, v5
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[4:5]
+; GFX7-NEXT: s_and_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[6:7]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s4 idxen offen offset:256 glc
+; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX7-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX7-NEXT: s_xor_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB8_1
+; GFX7-NEXT: ; %bb.2:
+; GFX7-NEXT: s_mov_b64 exec, s[12:13]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret double %ret
+}
+
+; Test waterfall loop on soffset
+define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__vgpr_soffset(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__vgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[6:7], exec
+; GFX6-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v2
+; GFX6-NEXT: v_readfirstlane_b32 s9, v3
+; GFX6-NEXT: v_readfirstlane_b32 s10, v4
+; GFX6-NEXT: v_readfirstlane_b32 s11, v5
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5]
+; GFX6-NEXT: v_readfirstlane_b32 s12, v8
+; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s12, v8
+; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], vcc
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s12 idxen offen offset:256 glc
+; GFX6-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX6-NEXT: ; implicit-def: $vgpr8
+; GFX6-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB9_1
+; GFX6-NEXT: ; %bb.2:
+; GFX6-NEXT: s_mov_b64 exec, s[6:7]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__vgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[6:7], exec
+; GFX7-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v2
+; GFX7-NEXT: v_readfirstlane_b32 s9, v3
+; GFX7-NEXT: v_readfirstlane_b32 s10, v4
+; GFX7-NEXT: v_readfirstlane_b32 s11, v5
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5]
+; GFX7-NEXT: v_readfirstlane_b32 s12, v8
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, s12, v8
+; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], vcc
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s12 idxen offen offset:256 glc
+; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX7-NEXT: ; implicit-def: $vgpr8
+; GFX7-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB9_1
+; GFX7-NEXT: ; %bb.2:
+; GFX7-NEXT: s_mov_b64 exec, s[6:7]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret double %ret
+}
+
+declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
new file mode 100644
index 0000000..87055db
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
@@ -0,0 +1,638 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
+; Not supported in gfx8 or gfx9
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+
+define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ ret float %ret
+}
+
+define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret float %ret
+}
+
+define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], s4 idxen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
+ ret float %ret
+}
+
+define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc slc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc slc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc slc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen glc slc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
+ ret float %ret
+}
+
+define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ ret void
+}
+
+define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret void
+}
+
+; Natural mapping, no voffset
+define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], s4 idxen
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s4 idxen
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
+ ret void
+}
+
+define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen slc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen slc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen slc
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen slc
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
+ ret void
+}
+
+; Test waterfall loop on resource
+define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[12:13], exec
+; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v1
+; GFX6-NEXT: v_readfirstlane_b32 s9, v2
+; GFX6-NEXT: v_readfirstlane_b32 s10, v3
+; GFX6-NEXT: v_readfirstlane_b32 s11, v4
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[3:4]
+; GFX6-NEXT: s_and_b64 s[6:7], vcc, s[6:7]
+; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[6:7]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc
+; GFX6-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX6-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX6-NEXT: s_xor_b64 exec, exec, s[6:7]
+; GFX6-NEXT: s_cbranch_execnz .LBB8_1
+; GFX6-NEXT: ; %bb.2:
+; GFX6-NEXT: s_mov_b64 exec, s[12:13]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[12:13], exec
+; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v1
+; GFX7-NEXT: v_readfirstlane_b32 s9, v2
+; GFX7-NEXT: v_readfirstlane_b32 s10, v3
+; GFX7-NEXT: v_readfirstlane_b32 s11, v4
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[3:4]
+; GFX7-NEXT: s_and_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[6:7]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc
+; GFX7-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX7-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX7-NEXT: s_xor_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB8_1
+; GFX7-NEXT: ; %bb.2:
+; GFX7-NEXT: s_mov_b64 exec, s[12:13]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_readfirstlane_b32 s8, v1
+; GFX10-NEXT: v_readfirstlane_b32 s9, v2
+; GFX10-NEXT: v_readfirstlane_b32 s10, v3
+; GFX10-NEXT: v_readfirstlane_b32 s11, v4
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[1:2]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[3:4]
+; GFX10-NEXT: s_and_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_and_saveexec_b32 s5, s5
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc
+; GFX10-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX10-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_cbranch_execnz .LBB8_1
+; GFX10-NEXT: ; %bb.2:
+; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-NEXT: v_readfirstlane_b32 s5, v2
+; GFX11-NEXT: v_readfirstlane_b32 s6, v3
+; GFX11-NEXT: v_readfirstlane_b32 s7, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX11-NEXT: s_and_saveexec_b32 s1, s1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_atomic_min_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 glc
+; GFX11-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: v_readfirstlane_b32 s4, v1
+; GFX12-NEXT: v_readfirstlane_b32 s5, v2
+; GFX12-NEXT: v_readfirstlane_b32 s6, v3
+; GFX12-NEXT: v_readfirstlane_b32 s7, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_and_saveexec_b32 s1, s1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-NEXT: ; %bb.2:
+; GFX12-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret float %ret
+}
+
+; Test waterfall loop on soffset
+define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__vgpr_soffset(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__vgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[6:7], exec
+; GFX6-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v1
+; GFX6-NEXT: v_readfirstlane_b32 s9, v2
+; GFX6-NEXT: v_readfirstlane_b32 s10, v3
+; GFX6-NEXT: v_readfirstlane_b32 s11, v4
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[3:4]
+; GFX6-NEXT: v_readfirstlane_b32 s12, v7
+; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s12, v7
+; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], vcc
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s12 idxen offen offset:256 glc
+; GFX6-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX6-NEXT: ; implicit-def: $vgpr7
+; GFX6-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB9_1
+; GFX6-NEXT: ; %bb.2:
+; GFX6-NEXT: s_mov_b64 exec, s[6:7]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__vgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[6:7], exec
+; GFX7-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v1
+; GFX7-NEXT: v_readfirstlane_b32 s9, v2
+; GFX7-NEXT: v_readfirstlane_b32 s10, v3
+; GFX7-NEXT: v_readfirstlane_b32 s11, v4
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[3:4]
+; GFX7-NEXT: v_readfirstlane_b32 s12, v7
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, s12, v7
+; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], vcc
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s12 idxen offen offset:256 glc
+; GFX7-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX7-NEXT: ; implicit-def: $vgpr7
+; GFX7-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB9_1
+; GFX7-NEXT: ; %bb.2:
+; GFX7-NEXT: s_mov_b64 exec, s[6:7]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__vgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_readfirstlane_b32 s8, v1
+; GFX10-NEXT: v_readfirstlane_b32 s9, v2
+; GFX10-NEXT: v_readfirstlane_b32 s10, v3
+; GFX10-NEXT: v_readfirstlane_b32 s11, v4
+; GFX10-NEXT: v_readfirstlane_b32 s7, v7
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[1:2]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[3:4]
+; GFX10-NEXT: v_cmp_eq_u32_e64 s5, s7, v7
+; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_and_b32 s4, s4, s5
+; GFX10-NEXT: s_and_saveexec_b32 s4, s4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s7 idxen offen offset:256 glc
+; GFX10-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX10-NEXT: ; implicit-def: $vgpr7
+; GFX10-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB9_1
+; GFX10-NEXT: ; %bb.2:
+; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__vgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-NEXT: v_readfirstlane_b32 s5, v2
+; GFX11-NEXT: v_readfirstlane_b32 s6, v3
+; GFX11-NEXT: v_readfirstlane_b32 s7, v4
+; GFX11-NEXT: v_readfirstlane_b32 s3, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_atomic_min_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 glc
+; GFX11-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX11-NEXT: ; implicit-def: $vgpr7
+; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB9_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__vgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: v_readfirstlane_b32 s4, v1
+; GFX12-NEXT: v_readfirstlane_b32 s5, v2
+; GFX12-NEXT: v_readfirstlane_b32 s6, v3
+; GFX12-NEXT: v_readfirstlane_b32 s7, v4
+; GFX12-NEXT: v_readfirstlane_b32 s3, v7
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_b32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX12-NEXT: ; implicit-def: $vgpr7
+; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB9_1
+; GFX12-NEXT: ; %bb.2:
+; GFX12-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret float %ret
+}
+
+declare float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll
new file mode 100644
index 0000000..5c23a86
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll
@@ -0,0 +1,271 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
+; Not supported in gfx8 or gfx9, except 90a/940
+; xUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90A %s
+; xUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX940 %s
+
+define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ ret double %ret
+}
+
+define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret double %ret
+}
+
+define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], s8 idxen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], s8 idxen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
+ ret double %ret
+}
+
+define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc slc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc slc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
+ ret double %ret
+}
+
+define void @struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ ret void
+}
+
+define void @struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret void
+}
+
+; Natural mapping, no voffset
+define void @struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], s8 idxen
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], s8 idxen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
+ ret void
+}
+
+define void @struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen slc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen slc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
+ ret void
+}
+
+; Test waterfall loop on resource
+define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[12:13], exec
+; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v2
+; GFX6-NEXT: v_readfirstlane_b32 s9, v3
+; GFX6-NEXT: v_readfirstlane_b32 s10, v4
+; GFX6-NEXT: v_readfirstlane_b32 s11, v5
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[4:5]
+; GFX6-NEXT: s_and_b64 s[6:7], vcc, s[6:7]
+; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[6:7]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s4 idxen offen offset:256 glc
+; GFX6-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX6-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX6-NEXT: s_xor_b64 exec, exec, s[6:7]
+; GFX6-NEXT: s_cbranch_execnz .LBB8_1
+; GFX6-NEXT: ; %bb.2:
+; GFX6-NEXT: s_mov_b64 exec, s[12:13]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[12:13], exec
+; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v2
+; GFX7-NEXT: v_readfirstlane_b32 s9, v3
+; GFX7-NEXT: v_readfirstlane_b32 s10, v4
+; GFX7-NEXT: v_readfirstlane_b32 s11, v5
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[4:5]
+; GFX7-NEXT: s_and_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[6:7]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s4 idxen offen offset:256 glc
+; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX7-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX7-NEXT: s_xor_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB8_1
+; GFX7-NEXT: ; %bb.2:
+; GFX7-NEXT: s_mov_b64 exec, s[12:13]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret double %ret
+}
+
+; Test waterfall loop on soffset
+define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__vgpr_soffset(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__vgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[6:7], exec
+; GFX6-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v2
+; GFX6-NEXT: v_readfirstlane_b32 s9, v3
+; GFX6-NEXT: v_readfirstlane_b32 s10, v4
+; GFX6-NEXT: v_readfirstlane_b32 s11, v5
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5]
+; GFX6-NEXT: v_readfirstlane_b32 s12, v8
+; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s12, v8
+; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], vcc
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s12 idxen offen offset:256 glc
+; GFX6-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX6-NEXT: ; implicit-def: $vgpr8
+; GFX6-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB9_1
+; GFX6-NEXT: ; %bb.2:
+; GFX6-NEXT: s_mov_b64 exec, s[6:7]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__vgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[6:7], exec
+; GFX7-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v2
+; GFX7-NEXT: v_readfirstlane_b32 s9, v3
+; GFX7-NEXT: v_readfirstlane_b32 s10, v4
+; GFX7-NEXT: v_readfirstlane_b32 s11, v5
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5]
+; GFX7-NEXT: v_readfirstlane_b32 s12, v8
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, s12, v8
+; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], vcc
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s12 idxen offen offset:256 glc
+; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX7-NEXT: ; implicit-def: $vgpr8
+; GFX7-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB9_1
+; GFX7-NEXT: ; %bb.2:
+; GFX7-NEXT: s_mov_b64 exec, s[6:7]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret double %ret
+}
+
+declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg)
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
index 31da626..7290a91 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
@@ -3350,17 +3350,17 @@ define float @test_atomicrmw_fmax_f32_global_agent(ptr addrspace(1) %ptr, float
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst
ret float %res
@@ -3372,17 +3372,17 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret float %res
@@ -3394,17 +3394,17 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret float %res
@@ -3416,17 +3416,17 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret float %res
@@ -3530,17 +3530,17 @@ define float @test_atomicrmw_fmin_f32_global_agent(ptr addrspace(1) %ptr, float
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst
ret float %res
@@ -3552,17 +3552,17 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret float %res
@@ -3574,17 +3574,17 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret float %res
@@ -3596,17 +3596,17 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret float %res
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
index 35c5463..05fb224 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
@@ -3350,17 +3350,17 @@ define float @test_atomicrmw_fmax_f32_global_system(ptr addrspace(1) %ptr, float
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst
ret float %res
@@ -3372,17 +3372,17 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0
ret float %res
@@ -3394,17 +3394,17 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory !0
ret float %res
@@ -3416,17 +3416,17 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret float %res
@@ -3530,17 +3530,17 @@ define float @test_atomicrmw_fmin_f32_global_system(ptr addrspace(1) %ptr, float
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst
ret float %res
@@ -3552,17 +3552,17 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0
ret float %res
@@ -3574,17 +3574,17 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory !0
ret float %res
@@ -3596,17 +3596,17 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret float %res
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll
index a5830bd..af6b7e0 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll
@@ -1246,17 +1246,17 @@ define double @test_atomicrmw_fmax_f64_global_agent(ptr addrspace(1) %ptr, doubl
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst
ret double %res
@@ -1268,17 +1268,17 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret double %res
@@ -1290,17 +1290,17 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret double %res
@@ -1312,17 +1312,17 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret double %res
@@ -1462,17 +1462,17 @@ define double @test_atomicrmw_fmin_f64_global_agent(ptr addrspace(1) %ptr, doubl
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst
ret double %res
@@ -1484,17 +1484,17 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret double %res
@@ -1506,17 +1506,17 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret double %res
@@ -1528,17 +1528,17 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret double %res
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
index 4489b63..69d65e6 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
@@ -1246,17 +1246,17 @@ define double @test_atomicrmw_fmax_f64_global_system(ptr addrspace(1) %ptr, doub
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst
ret double %res
@@ -1268,17 +1268,17 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0
ret double %res
@@ -1290,17 +1290,17 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(pt
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory !0
ret double %res
@@ -1312,17 +1312,17 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret double %res
@@ -1462,17 +1462,17 @@ define double @test_atomicrmw_fmin_f64_global_system(ptr addrspace(1) %ptr, doub
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst
ret double %res
@@ -1484,17 +1484,17 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0
ret double %res
@@ -1506,17 +1506,17 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(pt
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory !0
ret double %res
@@ -1528,17 +1528,17 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret double %res
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll
index 387bec7..2a5e1bd 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=atomic-expand %s | FileCheck -check-prefix=GCN %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=atomic-expand %s | FileCheck -check-prefix=GCN %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=atomic-expand %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=atomic-expand %s | FileCheck -check-prefixes=GCN,GFX9 %s
define float @test_atomicrmw_fmax_f32_flat(ptr %ptr, float %value) {
; GCN-LABEL: @test_atomicrmw_fmax_f32_flat(
@@ -257,3 +257,6 @@ define double @test_atomicrmw_fmax_f64_global_strictfp(ptr addrspace(1) %ptr, do
%res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst
ret double %res
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX7: {{.*}}
+; GFX9: {{.*}}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll
index e7c8faa..0fa409b 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=atomic-expand %s | FileCheck -check-prefix=GCN %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=atomic-expand %s | FileCheck -check-prefix=GCN %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=atomic-expand %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=atomic-expand %s | FileCheck -check-prefixes=GCN,GFX9 %s
define float @test_atomicrmw_fmin_f32_flat(ptr %ptr, float %value) {
; GCN-LABEL: @test_atomicrmw_fmin_f32_flat(
@@ -257,3 +257,6 @@ define double @test_atomicrmw_fmin_f64_global_strictfp(ptr addrspace(1) %ptr, do
%res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst
ret double %res
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX7: {{.*}}
+; GFX9: {{.*}}