From f433c3b38005701fdc219ae8c01e6af1b8bedba9 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 20 Apr 2024 00:43:36 +0200 Subject: AMDGPU: Add tests for atomicrmw handling of new metadata (#89248) Add baseline tests which should comprehensively test the new atomic metadata. Test codegen / expansion, and preservation in a few transforms. New metadata defined in #85052 --- llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll | 76 + .../test/CodeGen/AMDGPU/flat_atomics_i32_system.ll | 2161 +++++++++--- .../test/CodeGen/AMDGPU/flat_atomics_i64_system.ll | 2268 +++++++++--- llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll | 640 ++++ .../CodeGen/AMDGPU/global_atomics_i32_system.ll | 2381 ++++++++++--- .../CodeGen/AMDGPU/global_atomics_i64_system.ll | 2328 ++++++++++-- .../CodeGen/AMDGPU/global_atomics_scan_fadd.ll | 741 ++++ .../CodeGen/AMDGPU/global_atomics_scan_fmax.ll | 672 ++++ .../CodeGen/AMDGPU/global_atomics_scan_fmin.ll | 671 ++++ llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll | 710 ++++ .../AMDGPU/expand-atomic-f32-system.ll | 3717 ++++++++++++++++++++ .../AMDGPU/expand-atomic-f64-system.ll | 1685 +++++++++ .../AMDGPU/expand-atomic-i32-system.ll | 828 +++++ .../AMDGPU/expand-atomic-i64-system.ll | 828 +++++ .../AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll | 2209 +++++++++++- .../AMDGPU/expand-atomic-v2bf16-system.ll | 859 +++++ .../AMDGPU/expand-atomic-v2f16-system.ll | 859 +++++ .../expand-atomicrmw-integer-ops-0-to-add-0.ll | 10 + .../Transforms/InferAddressSpaces/AMDGPU/basic.ll | 10 + .../Inline/AMDGPU/inline-atomicrmw-md-preserve.ll | 30 + 20 files changed, 22034 insertions(+), 1649 deletions(-) create mode 100644 llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll create mode 100644 llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll create mode 100644 llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll create mode 100644 llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll create mode 100644 llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll create mode 100644 llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll create mode 100644 llvm/test/Transforms/Inline/AMDGPU/inline-atomicrmw-md-preserve.ll diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll index ebd6f18..376fe79f 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll @@ -35,3 +35,79 @@ define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_rtn_intrinsic(ptr %ptr, <2 x } declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr, <2 x half>) + +define <2 x half> @flat_agent_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) { + ; GFX940-LABEL: name: flat_agent_atomic_fadd_ret_v2f16 + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: successors: %bb.1(0x80000000) + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY4]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %ir.ptr) + ; GFX940-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: bb.1.atomicrmw.start: + ; GFX940-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[PHI:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_]], %bb.0, %4, %bb.1 + ; GFX940-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.0, %3, %bb.1 + ; GFX940-NEXT: [[V_PK_ADD_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_ADD_F16 8, [[PHI1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_PK_ADD_F16_]], %subreg.sub0, [[PHI1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY3]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst seq_cst (s32) on %ir.ptr) + ; GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI1]], implicit $exec + ; GFX940-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK killed [[V_CMP_EQ_U32_e64_]], [[PHI]], implicit-def dead $scc + ; GFX940-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940-NEXT: S_BRANCH %bb.2 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: bb.2.atomicrmw.end: + ; GFX940-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[FLAT_ATOMIC_CMPSWAP_RTN]], %bb.1 + ; GFX940-NEXT: [[PHI3:%[0-9]+]]:sreg_64 = PHI [[SI_IF_BREAK]], %bb.1 + ; GFX940-NEXT: SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940-NEXT: $vgpr0 = COPY [[PHI2]] + ; GFX940-NEXT: SI_RETURN implicit $vgpr0 + %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst + ret <2 x half> %result +} + +define void @flat_agent_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) { + ; GFX940-LABEL: name: flat_agent_atomic_fadd_noret_v2f16 + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: successors: %bb.1(0x80000000) + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY4]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %ir.ptr) + ; GFX940-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: bb.1.atomicrmw.start: + ; GFX940-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[PHI:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_]], %bb.0, %4, %bb.1 + ; GFX940-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.0, %3, %bb.1 + ; GFX940-NEXT: [[V_PK_ADD_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_ADD_F16 8, [[PHI1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_PK_ADD_F16_]], %subreg.sub0, [[PHI1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY3]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst seq_cst (s32) on %ir.ptr) + ; GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI1]], implicit $exec + ; GFX940-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK killed [[V_CMP_EQ_U32_e64_]], [[PHI]], implicit-def dead $scc + ; GFX940-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940-NEXT: S_BRANCH %bb.2 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: bb.2.atomicrmw.end: + ; GFX940-NEXT: [[PHI2:%[0-9]+]]:sreg_64 = PHI [[SI_IF_BREAK]], %bb.1 + ; GFX940-NEXT: SI_END_CF [[PHI2]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940-NEXT: SI_RETURN + %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index 9495651..9612734 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -287,6 +287,72 @@ define amdgpu_gfx i32 @flat_atomic_xchg_i32_ret_offset_scalar(ptr inreg %out, i3 ret i32 %result } +define void @flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_swap v[0:1], v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_swap v[0:1], v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %tmp0 = atomicrmw xchg ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %result = atomicrmw xchg ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw xchg f32 ; --------------------------------------------------------------------- @@ -571,6 +637,72 @@ define amdgpu_gfx float @flat_atomic_xchg_f32_ret_offset_scalar(ptr inreg %out, ret float %result } +define void @flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, float %in) { +; GCN1-LABEL: flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_swap v[0:1], v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_swap v[0:1], v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr %out, i64 4 + %tmp0 = atomicrmw xchg ptr %gep, float %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define float @flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, float %in) { +; GCN1-LABEL: flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr %out, i64 4 + %result = atomicrmw xchg ptr %gep, float %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret float %result +} + ; --------------------------------------------------------------------- ; atomicrmw add ; --------------------------------------------------------------------- @@ -855,6 +987,72 @@ define amdgpu_gfx i32 @flat_atomic_add_i32_ret_offset_scalar(ptr inreg %out, i32 ret i32 %result } +define void @flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_add v[0:1], v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_add v[0:1], v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %tmp0 = atomicrmw add ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_add v0, v[0:1], v2 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_add v0, v[0:1], v2 glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_add v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %result = atomicrmw add ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw sub ; --------------------------------------------------------------------- @@ -1139,6 +1337,72 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32 ret i32 %result } +define void @flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_sub v[0:1], v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_sub v[0:1], v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %result = atomicrmw sub ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw and ; --------------------------------------------------------------------- @@ -1423,61 +1687,127 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32 ret i32 %result } -; --------------------------------------------------------------------- -; atomicrmw nand -; --------------------------------------------------------------------- - -define void @flat_atomic_nand_i32_noret(ptr %ptr, i32 %in) { -; GCN1-LABEL: flat_atomic_nand_i32_noret: +define void @flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_load_dword v4, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB40_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v3, v4, v2 -; GCN1-NEXT: v_not_b32_e32 v3, v3 -; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_and v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB40_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; -; GCN2-LABEL: flat_atomic_nand_i32_noret: +; GCN2-LABEL: flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_load_dword v4, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB40_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v3, v4, v2 -; GCN2-NEXT: v_not_b32_e32 v3, v3 -; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_and v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB40_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; -; GCN3-LABEL: flat_atomic_nand_i32_noret: +; GCN3-LABEL: flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %result = atomicrmw and ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + +; --------------------------------------------------------------------- +; atomicrmw nand +; --------------------------------------------------------------------- + +define void @flat_atomic_nand_i32_noret(ptr %ptr, i32 %in) { +; GCN1-LABEL: flat_atomic_nand_i32_noret: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB50_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN1-NEXT: v_not_b32_e32 v3, v3 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB50_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_nand_i32_noret: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB50_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN2-NEXT: v_not_b32_e32 v3, v3 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB50_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_nand_i32_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB40_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB50_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 @@ -1489,7 +1819,7 @@ define void @flat_atomic_nand_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB40_1 +; GCN3-NEXT: s_cbranch_execnz .LBB50_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -1505,7 +1835,7 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB41_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB51_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v3, v4, v2 @@ -1517,7 +1847,7 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB41_1 +; GCN1-NEXT: s_cbranch_execnz .LBB51_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -1529,7 +1859,7 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB41_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB51_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v3, v4, v2 @@ -1541,7 +1871,7 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB41_1 +; GCN2-NEXT: s_cbranch_execnz .LBB51_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -1551,7 +1881,7 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB41_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB51_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 @@ -1563,7 +1893,7 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB41_1 +; GCN3-NEXT: s_cbranch_execnz .LBB51_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -1578,7 +1908,7 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB42_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB52_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v3 @@ -1590,7 +1920,7 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB42_1 +; GCN1-NEXT: s_cbranch_execnz .LBB52_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v3 @@ -1601,7 +1931,7 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB42_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB52_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v3 @@ -1613,7 +1943,7 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB42_1 +; GCN2-NEXT: s_cbranch_execnz .LBB52_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v3 @@ -1624,7 +1954,7 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB42_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB52_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -1636,7 +1966,7 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB42_1 +; GCN3-NEXT: s_cbranch_execnz .LBB52_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 @@ -1653,7 +1983,7 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v0, v[3:4] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB43_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB53_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 @@ -1665,7 +1995,7 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB43_1 +; GCN1-NEXT: s_cbranch_execnz .LBB53_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -1677,7 +2007,7 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v0, v[3:4] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB43_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB53_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 @@ -1689,7 +2019,7 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB43_1 +; GCN2-NEXT: s_cbranch_execnz .LBB53_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -1699,7 +2029,7 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB43_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB53_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -1711,7 +2041,7 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB43_1 +; GCN3-NEXT: s_cbranch_execnz .LBB53_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 @@ -1729,7 +2059,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB44_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB54_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v2, s6, v3 @@ -1741,7 +2071,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB44_1 +; GCN1-NEXT: s_cbranch_execnz .LBB54_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -1753,7 +2083,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB44_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB54_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v2, s6, v3 @@ -1765,7 +2095,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB44_1 +; GCN2-NEXT: s_cbranch_execnz .LBB54_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -1777,7 +2107,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB44_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB54_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v2, s6, v3 @@ -1789,7 +2119,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB44_1 +; GCN3-NEXT: s_cbranch_execnz .LBB54_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -1807,7 +2137,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB45_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v2, s6, v3 @@ -1819,7 +2149,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB45_1 +; GCN1-NEXT: s_cbranch_execnz .LBB55_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -1833,7 +2163,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB45_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v2, s6, v3 @@ -1845,7 +2175,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB45_1 +; GCN2-NEXT: s_cbranch_execnz .LBB55_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -1857,7 +2187,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB45_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v2, s6, v3 @@ -1869,7 +2199,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB45_1 +; GCN3-NEXT: s_cbranch_execnz .LBB55_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -1888,7 +2218,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 -; GCN1-NEXT: .LBB46_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -1900,7 +2230,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB46_1 +; GCN1-NEXT: s_cbranch_execnz .LBB56_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -1914,7 +2244,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 -; GCN2-NEXT: .LBB46_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -1926,7 +2256,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB46_1 +; GCN2-NEXT: s_cbranch_execnz .LBB56_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -1940,7 +2270,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB46_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -1952,7 +2282,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB46_1 +; GCN3-NEXT: s_cbranch_execnz .LBB56_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -1970,7 +2300,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: v_mov_b32_e32 v2, s35 ; GCN1-NEXT: flat_load_dword v0, v[1:2] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB47_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -1982,7 +2312,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB47_1 +; GCN1-NEXT: s_cbranch_execnz .LBB57_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -1996,7 +2326,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: v_mov_b32_e32 v2, s35 ; GCN2-NEXT: flat_load_dword v0, v[1:2] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB47_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -2008,7 +2338,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB47_1 +; GCN2-NEXT: s_cbranch_execnz .LBB57_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -2022,7 +2352,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB47_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -2034,7 +2364,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB47_1 +; GCN3-NEXT: s_cbranch_execnz .LBB57_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -2043,6 +2373,157 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ret i32 %result } +define void @flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN1-NEXT: v_not_b32_e32 v3, v3 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB58_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN2-NEXT: v_not_b32_e32 v3, v3 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB58_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN3-NEXT: v_not_b32_e32 v3, v3 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB58_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %tmp0 = atomicrmw nand ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_and_b32_e32 v0, v1, v2 +; GCN1-NEXT: v_not_b32_e32 v0, v0 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB59_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_and_b32_e32 v0, v1, v2 +; GCN2-NEXT: v_not_b32_e32 v0, v0 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB59_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN3-NEXT: v_not_b32_e32 v3, v3 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB59_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %result = atomicrmw nand ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw or ; --------------------------------------------------------------------- @@ -2327,84 +2808,150 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32 ret i32 %result } -; --------------------------------------------------------------------- -; atomicrmw xor -; --------------------------------------------------------------------- - -define void @flat_atomic_xor_i32_noret(ptr %ptr, i32 %in) { -; GCN1-LABEL: flat_atomic_xor_i32_noret: +define void @flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor v[0:1], v2 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_or v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_setpc_b64 s[30:31] ; -; GCN2-LABEL: flat_atomic_xor_i32_noret: +; GCN2-LABEL: flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_or v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_setpc_b64 s[30:31] ; -; GCN3-LABEL: flat_atomic_xor_i32_noret: +; GCN3-LABEL: flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v[0:1], v2 +; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw xor ptr %ptr, i32 %in seq_cst + %gep = getelementptr i32, ptr %out, i64 4 + %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 ret void } -define void @flat_atomic_xor_i32_noret_offset(ptr %out, i32 %in) { -; GCN1-LABEL: flat_atomic_xor_i32_noret_offset: +define i32 @flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_xor v[0:1], v2 +; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_setpc_b64 s[30:31] ; -; GCN2-LABEL: flat_atomic_xor_i32_noret_offset: +; GCN2-LABEL: flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_setpc_b64 s[30:31] ; -; GCN3-LABEL: flat_atomic_xor_i32_noret_offset: +; GCN3-LABEL: flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16 +; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 - %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst - ret void + %gep = getelementptr i32, ptr %out, i64 4 + %result = atomicrmw or ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result } -define i32 @flat_atomic_xor_i32_ret(ptr %ptr, i32 %in) { -; GCN1-LABEL: flat_atomic_xor_i32_ret: +; --------------------------------------------------------------------- +; atomicrmw xor +; --------------------------------------------------------------------- + +define void @flat_atomic_xor_i32_noret(ptr %ptr, i32 %in) { +; GCN1-LABEL: flat_atomic_xor_i32_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_xor v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_setpc_b64 s[30:31] ; -; GCN2-LABEL: flat_atomic_xor_i32_ret: +; GCN2-LABEL: flat_atomic_xor_i32_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_xor_i32_noret: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_xor v[0:1], v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw xor ptr %ptr, i32 %in seq_cst + ret void +} + +define void @flat_atomic_xor_i32_noret_offset(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_xor_i32_noret_offset: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_xor v[0:1], v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_xor_i32_noret_offset: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_xor_i32_noret_offset: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i32 4 + %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst + ret void +} + +define i32 @flat_atomic_xor_i32_ret(ptr %ptr, i32 %in) { +; GCN1-LABEL: flat_atomic_xor_i32_ret: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_xor_i32_ret: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -2611,6 +3158,72 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32 ret i32 %result } +define void @flat_xor_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_xor_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_xor v[0:1], v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_xor_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_xor_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %result = atomicrmw xor ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw max ; --------------------------------------------------------------------- @@ -2621,7 +3234,7 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB64_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB80_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_i32_e32 v3, v4, v2 @@ -2632,7 +3245,7 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB64_1 +; GCN1-NEXT: s_cbranch_execnz .LBB80_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -2642,7 +3255,7 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB64_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB80_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_i32_e32 v3, v4, v2 @@ -2653,7 +3266,7 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB64_1 +; GCN2-NEXT: s_cbranch_execnz .LBB80_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -2663,7 +3276,7 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB64_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB80_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_i32_e32 v3, v4, v2 @@ -2674,7 +3287,7 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB64_1 +; GCN3-NEXT: s_cbranch_execnz .LBB80_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -2690,7 +3303,7 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB65_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB81_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_i32_e32 v3, v4, v2 @@ -2701,7 +3314,7 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB65_1 +; GCN1-NEXT: s_cbranch_execnz .LBB81_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -2713,7 +3326,7 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB65_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB81_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_i32_e32 v3, v4, v2 @@ -2724,7 +3337,7 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB65_1 +; GCN2-NEXT: s_cbranch_execnz .LBB81_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -2734,7 +3347,7 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB65_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB81_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_i32_e32 v3, v4, v2 @@ -2745,7 +3358,7 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB65_1 +; GCN3-NEXT: s_cbranch_execnz .LBB81_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -2760,7 +3373,7 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB66_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB82_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v3 @@ -2771,7 +3384,7 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB66_1 +; GCN1-NEXT: s_cbranch_execnz .LBB82_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v3 @@ -2782,7 +3395,7 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB66_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB82_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v3 @@ -2793,7 +3406,7 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB66_1 +; GCN2-NEXT: s_cbranch_execnz .LBB82_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v3 @@ -2804,7 +3417,7 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB66_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB82_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -2815,7 +3428,7 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB66_1 +; GCN3-NEXT: s_cbranch_execnz .LBB82_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 @@ -2832,7 +3445,7 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v0, v[3:4] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB67_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB83_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 @@ -2843,7 +3456,7 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB67_1 +; GCN1-NEXT: s_cbranch_execnz .LBB83_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -2855,7 +3468,7 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v0, v[3:4] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB67_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB83_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 @@ -2866,7 +3479,7 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB67_1 +; GCN2-NEXT: s_cbranch_execnz .LBB83_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -2876,7 +3489,7 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB67_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB83_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -2887,7 +3500,7 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB67_1 +; GCN3-NEXT: s_cbranch_execnz .LBB83_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 @@ -2905,7 +3518,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB68_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 @@ -2916,7 +3529,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB68_1 +; GCN1-NEXT: s_cbranch_execnz .LBB84_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -2928,7 +3541,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB68_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 @@ -2939,7 +3552,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB68_1 +; GCN2-NEXT: s_cbranch_execnz .LBB84_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -2951,7 +3564,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB68_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 @@ -2962,7 +3575,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB68_1 +; GCN3-NEXT: s_cbranch_execnz .LBB84_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -2980,7 +3593,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB69_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 @@ -2991,7 +3604,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB69_1 +; GCN1-NEXT: s_cbranch_execnz .LBB85_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3005,7 +3618,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB69_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 @@ -3016,7 +3629,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB69_1 +; GCN2-NEXT: s_cbranch_execnz .LBB85_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3028,7 +3641,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB69_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 @@ -3039,7 +3652,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB69_1 +; GCN3-NEXT: s_cbranch_execnz .LBB85_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -3058,7 +3671,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 -; GCN1-NEXT: .LBB70_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -3069,7 +3682,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB70_1 +; GCN1-NEXT: s_cbranch_execnz .LBB86_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3083,7 +3696,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 -; GCN2-NEXT: .LBB70_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -3094,7 +3707,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB70_1 +; GCN2-NEXT: s_cbranch_execnz .LBB86_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3108,7 +3721,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB70_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -3119,7 +3732,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB70_1 +; GCN3-NEXT: s_cbranch_execnz .LBB86_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -3137,7 +3750,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: v_mov_b32_e32 v2, s35 ; GCN1-NEXT: flat_load_dword v0, v[1:2] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB71_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -3148,7 +3761,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB71_1 +; GCN1-NEXT: s_cbranch_execnz .LBB87_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3162,7 +3775,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: v_mov_b32_e32 v2, s35 ; GCN2-NEXT: flat_load_dword v0, v[1:2] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB71_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -3173,7 +3786,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB71_1 +; GCN2-NEXT: s_cbranch_execnz .LBB87_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3187,7 +3800,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB71_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -3198,7 +3811,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB71_1 +; GCN3-NEXT: s_cbranch_execnz .LBB87_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -3223,7 +3836,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB72_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_i32_e32 v2, s2, v3 @@ -3234,7 +3847,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB72_1 +; GCN1-NEXT: s_cbranch_execnz .LBB88_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -3253,7 +3866,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB72_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_i32_e32 v2, s2, v3 @@ -3264,7 +3877,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB72_1 +; GCN2-NEXT: s_cbranch_execnz .LBB88_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -3281,7 +3894,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB72_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 @@ -3292,7 +3905,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB72_1 +; GCN3-NEXT: s_cbranch_execnz .LBB88_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -3319,7 +3932,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v2, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB73_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v2 @@ -3330,7 +3943,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB73_1 +; GCN1-NEXT: s_cbranch_execnz .LBB89_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -3354,7 +3967,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB73_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v2 @@ -3365,7 +3978,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB73_1 +; GCN2-NEXT: s_cbranch_execnz .LBB89_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -3387,7 +4000,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB73_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 @@ -3398,7 +4011,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB73_1 +; GCN3-NEXT: s_cbranch_execnz .LBB89_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -3427,7 +4040,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB74_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_i32_e32 v2, s2, v3 @@ -3438,7 +4051,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB74_1 +; GCN1-NEXT: s_cbranch_execnz .LBB90_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -3455,7 +4068,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB74_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_i32_e32 v2, s2, v3 @@ -3466,7 +4079,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB74_1 +; GCN2-NEXT: s_cbranch_execnz .LBB90_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -3483,7 +4096,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB74_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 @@ -3494,7 +4107,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB74_1 +; GCN3-NEXT: s_cbranch_execnz .LBB90_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -3518,7 +4131,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v2, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB75_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v2 @@ -3529,7 +4142,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB75_1 +; GCN1-NEXT: s_cbranch_execnz .LBB91_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -3551,7 +4164,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB75_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v2 @@ -3562,7 +4175,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB75_1 +; GCN2-NEXT: s_cbranch_execnz .LBB91_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -3584,7 +4197,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB75_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 @@ -3595,7 +4208,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB75_1 +; GCN3-NEXT: s_cbranch_execnz .LBB91_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -3609,20 +4222,18 @@ entry: ret void } -; --------------------------------------------------------------------- -; atomicrmw umax -; --------------------------------------------------------------------- - -define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) { -; GCN1-LABEL: flat_atomic_umax_i32_noret: +define void @flat_max_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory_access: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB76_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB92_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_u32_e32 v3, v4, v2 +; GCN1-NEXT: v_max_i32_e32 v3, v4, v2 ; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3630,20 +4241,22 @@ define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB76_1 +; GCN1-NEXT: s_cbranch_execnz .LBB92_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; -; GCN2-LABEL: flat_atomic_umax_i32_noret: +; GCN2-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory_access: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB76_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB92_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_u32_e32 v3, v4, v2 +; GCN2-NEXT: v_max_i32_e32 v3, v4, v2 ; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3651,91 +4264,236 @@ define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB76_1 +; GCN2-NEXT: s_cbranch_execnz .LBB92_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; -; GCN3-LABEL: flat_atomic_umax_i32_noret: +; GCN3-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory_access: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB76_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB92_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_u32_e32 v3, v4, v2 -; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN3-NEXT: v_max_i32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB76_1 +; GCN3-NEXT: s_cbranch_execnz .LBB92_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw umax ptr %ptr, i32 %in seq_cst + %gep = getelementptr i32, ptr %out, i64 4 + %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 ret void } -define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) { -; GCN1-LABEL: flat_atomic_umax_i32_noret_offset: +define i32 @flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB77_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB93_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_u32_e32 v3, v4, v2 -; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_max_i32_e32 v0, v1, v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v4, v3 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB77_1 +; GCN1-NEXT: s_cbranch_execnz .LBB93_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; -; GCN2-LABEL: flat_atomic_umax_i32_noret_offset: +; GCN2-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB77_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB93_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_u32_e32 v3, v4, v2 -; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_max_i32_e32 v0, v1, v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v4, v3 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB77_1 +; GCN2-NEXT: s_cbranch_execnz .LBB93_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; -; GCN3-LABEL: flat_atomic_umax_i32_noret_offset: +; GCN3-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB77_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB93_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_u32_e32 v3, v4, v2 +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_max_i32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB93_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %result = atomicrmw max ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + +; --------------------------------------------------------------------- +; atomicrmw umax +; --------------------------------------------------------------------- + +define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) { +; GCN1-LABEL: flat_atomic_umax_i32_noret: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_max_u32_e32 v3, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB94_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_umax_i32_noret: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_max_u32_e32 v3, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB94_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_umax_i32_noret: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_max_u32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB94_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw umax ptr %ptr, i32 %in seq_cst + ret void +} + +define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_umax_i32_noret_offset: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB95_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_max_u32_e32 v3, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB95_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_umax_i32_noret_offset: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB95_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_max_u32_e32 v3, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB95_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_umax_i32_noret_offset: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB95_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_max_u32_e32 v3, v4, v2 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -3743,7 +4501,7 @@ define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB77_1 +; GCN3-NEXT: s_cbranch_execnz .LBB95_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -3758,7 +4516,7 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB78_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB96_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v3 @@ -3769,7 +4527,7 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB78_1 +; GCN1-NEXT: s_cbranch_execnz .LBB96_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v3 @@ -3780,7 +4538,7 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB78_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB96_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v3 @@ -3791,7 +4549,7 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB78_1 +; GCN2-NEXT: s_cbranch_execnz .LBB96_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v3 @@ -3802,7 +4560,7 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB78_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB96_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -3813,7 +4571,7 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB78_1 +; GCN3-NEXT: s_cbranch_execnz .LBB96_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 @@ -3830,7 +4588,7 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v0, v[3:4] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB79_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB97_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 @@ -3841,7 +4599,7 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB79_1 +; GCN1-NEXT: s_cbranch_execnz .LBB97_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3853,7 +4611,7 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v0, v[3:4] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB79_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB97_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 @@ -3864,7 +4622,7 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB79_1 +; GCN2-NEXT: s_cbranch_execnz .LBB97_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3874,7 +4632,7 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB79_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB97_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -3885,7 +4643,7 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB79_1 +; GCN3-NEXT: s_cbranch_execnz .LBB97_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 @@ -3903,7 +4661,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB80_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 @@ -3914,7 +4672,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB80_1 +; GCN1-NEXT: s_cbranch_execnz .LBB98_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3926,7 +4684,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB80_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 @@ -3937,7 +4695,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB80_1 +; GCN2-NEXT: s_cbranch_execnz .LBB98_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3949,7 +4707,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB80_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 @@ -3960,7 +4718,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB80_1 +; GCN3-NEXT: s_cbranch_execnz .LBB98_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -3978,7 +4736,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB81_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 @@ -3989,7 +4747,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB81_1 +; GCN1-NEXT: s_cbranch_execnz .LBB99_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4003,7 +4761,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB81_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 @@ -4014,7 +4772,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB81_1 +; GCN2-NEXT: s_cbranch_execnz .LBB99_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4026,7 +4784,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB81_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 @@ -4037,7 +4795,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB81_1 +; GCN3-NEXT: s_cbranch_execnz .LBB99_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -4056,7 +4814,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 -; GCN1-NEXT: .LBB82_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -4067,7 +4825,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB82_1 +; GCN1-NEXT: s_cbranch_execnz .LBB100_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4081,7 +4839,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 -; GCN2-NEXT: .LBB82_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -4092,7 +4850,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB82_1 +; GCN2-NEXT: s_cbranch_execnz .LBB100_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4106,7 +4864,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB82_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -4117,7 +4875,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB82_1 +; GCN3-NEXT: s_cbranch_execnz .LBB100_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -4135,7 +4893,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: v_mov_b32_e32 v2, s35 ; GCN1-NEXT: flat_load_dword v0, v[1:2] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB83_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -4146,7 +4904,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB83_1 +; GCN1-NEXT: s_cbranch_execnz .LBB101_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4160,7 +4918,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: v_mov_b32_e32 v2, s35 ; GCN2-NEXT: flat_load_dword v0, v[1:2] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB83_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -4171,7 +4929,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB83_1 +; GCN2-NEXT: s_cbranch_execnz .LBB101_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4185,7 +4943,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB83_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -4196,7 +4954,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB83_1 +; GCN3-NEXT: s_cbranch_execnz .LBB101_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -4221,7 +4979,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB84_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_u32_e32 v2, s2, v3 @@ -4232,7 +4990,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB84_1 +; GCN1-NEXT: s_cbranch_execnz .LBB102_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -4251,7 +5009,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB84_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_u32_e32 v2, s2, v3 @@ -4262,7 +5020,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB84_1 +; GCN2-NEXT: s_cbranch_execnz .LBB102_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -4279,7 +5037,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB84_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 @@ -4290,7 +5048,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB84_1 +; GCN3-NEXT: s_cbranch_execnz .LBB102_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -4317,7 +5075,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v2, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB85_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v2 @@ -4328,7 +5086,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB85_1 +; GCN1-NEXT: s_cbranch_execnz .LBB103_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -4352,7 +5110,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB85_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v2 @@ -4363,7 +5121,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB85_1 +; GCN2-NEXT: s_cbranch_execnz .LBB103_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -4385,7 +5143,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB85_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 @@ -4396,7 +5154,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB85_1 +; GCN3-NEXT: s_cbranch_execnz .LBB103_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -4426,7 +5184,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v2, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v2 @@ -4437,7 +5195,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB86_1 +; GCN1-NEXT: s_cbranch_execnz .LBB104_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -4459,7 +5217,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB86_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v2 @@ -4470,7 +5228,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB86_1 +; GCN2-NEXT: s_cbranch_execnz .LBB104_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -4492,29 +5250,174 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_cbranch_execnz .LBB104_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: s_endpgm +entry: + %ptr = getelementptr i32, ptr %out, i32 %index + %tmp0 = atomicrmw umax ptr %ptr, i32 %in seq_cst + store i32 %tmp0, ptr %out2 + ret void +} + +define void @flat_umax_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_max_u32_e32 v3, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB105_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_max_u32_e32 v3, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB105_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_max_u32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB105_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_max_u32_e32 v0, v1, v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB106_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_max_u32_e32 v0, v1, v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB106_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB106_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_max_u32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB86_1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB106_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: flat_store_dword v[0:1], v2 -; GCN3-NEXT: s_endpgm -entry: - %ptr = getelementptr i32, ptr %out, i32 %index - %tmp0 = atomicrmw umax ptr %ptr, i32 %in seq_cst - store i32 %tmp0, ptr %out2 - ret void +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %result = atomicrmw umax ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result } ; --------------------------------------------------------------------- @@ -4527,7 +5430,7 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB87_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB107_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_u32_e32 v3, v4, v2 @@ -4538,7 +5441,7 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB87_1 +; GCN1-NEXT: s_cbranch_execnz .LBB107_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4548,7 +5451,7 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB87_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB107_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_u32_e32 v3, v4, v2 @@ -4559,7 +5462,7 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB87_1 +; GCN2-NEXT: s_cbranch_execnz .LBB107_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4569,7 +5472,7 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB87_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB107_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_u32_e32 v3, v4, v2 @@ -4580,7 +5483,7 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB87_1 +; GCN3-NEXT: s_cbranch_execnz .LBB107_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -4596,7 +5499,7 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB88_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB108_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_u32_e32 v3, v4, v2 @@ -4607,7 +5510,7 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB88_1 +; GCN1-NEXT: s_cbranch_execnz .LBB108_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4619,7 +5522,7 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB108_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_u32_e32 v3, v4, v2 @@ -4630,7 +5533,7 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB88_1 +; GCN2-NEXT: s_cbranch_execnz .LBB108_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4640,7 +5543,7 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB108_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_u32_e32 v3, v4, v2 @@ -4651,7 +5554,7 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB88_1 +; GCN3-NEXT: s_cbranch_execnz .LBB108_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -4666,7 +5569,7 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB89_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB109_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v3 @@ -4677,7 +5580,7 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB89_1 +; GCN1-NEXT: s_cbranch_execnz .LBB109_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v3 @@ -4688,7 +5591,7 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB109_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v3 @@ -4699,7 +5602,7 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB89_1 +; GCN2-NEXT: s_cbranch_execnz .LBB109_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v3 @@ -4710,7 +5613,7 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB109_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -4721,7 +5624,7 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB89_1 +; GCN3-NEXT: s_cbranch_execnz .LBB109_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 @@ -4738,7 +5641,7 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v0, v[3:4] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB90_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB110_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 @@ -4749,7 +5652,7 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB90_1 +; GCN1-NEXT: s_cbranch_execnz .LBB110_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4761,7 +5664,7 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v0, v[3:4] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB110_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 @@ -4772,7 +5675,7 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB90_1 +; GCN2-NEXT: s_cbranch_execnz .LBB110_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4782,7 +5685,7 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB110_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -4793,7 +5696,7 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB90_1 +; GCN3-NEXT: s_cbranch_execnz .LBB110_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 @@ -4811,7 +5714,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB91_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_u32_e32 v2, s6, v3 @@ -4822,7 +5725,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB91_1 +; GCN1-NEXT: s_cbranch_execnz .LBB111_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4834,7 +5737,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_u32_e32 v2, s6, v3 @@ -4845,7 +5748,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB91_1 +; GCN2-NEXT: s_cbranch_execnz .LBB111_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4857,7 +5760,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_u32_e32 v2, s6, v3 @@ -4868,7 +5771,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB91_1 +; GCN3-NEXT: s_cbranch_execnz .LBB111_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -4886,7 +5789,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB92_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_u32_e32 v2, s6, v3 @@ -4897,7 +5800,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB92_1 +; GCN1-NEXT: s_cbranch_execnz .LBB112_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4911,7 +5814,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB92_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_u32_e32 v2, s6, v3 @@ -4922,7 +5825,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB92_1 +; GCN2-NEXT: s_cbranch_execnz .LBB112_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4934,7 +5837,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB92_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_u32_e32 v2, s6, v3 @@ -4945,7 +5848,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB92_1 +; GCN3-NEXT: s_cbranch_execnz .LBB112_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -4964,7 +5867,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 -; GCN1-NEXT: .LBB93_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -4975,7 +5878,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB93_1 +; GCN1-NEXT: s_cbranch_execnz .LBB113_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4989,7 +5892,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 -; GCN2-NEXT: .LBB93_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -5000,7 +5903,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB93_1 +; GCN2-NEXT: s_cbranch_execnz .LBB113_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5014,7 +5917,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB93_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -5025,7 +5928,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB93_1 +; GCN3-NEXT: s_cbranch_execnz .LBB113_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5043,7 +5946,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: v_mov_b32_e32 v2, s35 ; GCN1-NEXT: flat_load_dword v0, v[1:2] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -5054,7 +5957,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB94_1 +; GCN1-NEXT: s_cbranch_execnz .LBB114_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5068,7 +5971,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: v_mov_b32_e32 v2, s35 ; GCN2-NEXT: flat_load_dword v0, v[1:2] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -5079,7 +5982,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB94_1 +; GCN2-NEXT: s_cbranch_execnz .LBB114_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5093,7 +5996,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -5104,7 +6007,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB94_1 +; GCN3-NEXT: s_cbranch_execnz .LBB114_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5113,6 +6016,151 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ret i32 %result } +define void @flat_umin_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_min_u32_e32 v3, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB115_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_min_u32_e32 v3, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB115_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_min_u32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB115_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_min_u32_e32 v0, v1, v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB116_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_min_u32_e32 v0, v1, v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB116_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_min_u32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB116_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %result = atomicrmw umin ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw min ; --------------------------------------------------------------------- @@ -5123,7 +6171,7 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB95_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB117_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_i32_e32 v3, v4, v2 @@ -5134,7 +6182,7 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB95_1 +; GCN1-NEXT: s_cbranch_execnz .LBB117_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5144,7 +6192,7 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB95_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB117_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v3, v4, v2 @@ -5155,7 +6203,7 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB95_1 +; GCN2-NEXT: s_cbranch_execnz .LBB117_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5165,7 +6213,7 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB95_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB117_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_i32_e32 v3, v4, v2 @@ -5176,7 +6224,7 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB95_1 +; GCN3-NEXT: s_cbranch_execnz .LBB117_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5192,7 +6240,7 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB96_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB118_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_i32_e32 v3, v4, v2 @@ -5203,7 +6251,7 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB96_1 +; GCN1-NEXT: s_cbranch_execnz .LBB118_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5215,7 +6263,7 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB96_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB118_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v3, v4, v2 @@ -5226,7 +6274,7 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB96_1 +; GCN2-NEXT: s_cbranch_execnz .LBB118_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5236,7 +6284,7 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB96_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB118_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_i32_e32 v3, v4, v2 @@ -5247,7 +6295,7 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB96_1 +; GCN3-NEXT: s_cbranch_execnz .LBB118_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5262,7 +6310,7 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB97_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB119_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v3 @@ -5273,7 +6321,7 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB97_1 +; GCN1-NEXT: s_cbranch_execnz .LBB119_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v3 @@ -5284,7 +6332,7 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB97_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB119_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v3 @@ -5295,7 +6343,7 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB97_1 +; GCN2-NEXT: s_cbranch_execnz .LBB119_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v3 @@ -5306,7 +6354,7 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB97_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB119_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -5317,7 +6365,7 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB97_1 +; GCN3-NEXT: s_cbranch_execnz .LBB119_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 @@ -5334,7 +6382,7 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v0, v[3:4] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB98_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB120_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 @@ -5345,7 +6393,7 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB98_1 +; GCN1-NEXT: s_cbranch_execnz .LBB120_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5357,7 +6405,7 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v0, v[3:4] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB98_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB120_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 @@ -5368,7 +6416,7 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB98_1 +; GCN2-NEXT: s_cbranch_execnz .LBB120_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5378,7 +6426,7 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB98_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB120_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -5389,7 +6437,7 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB98_1 +; GCN3-NEXT: s_cbranch_execnz .LBB120_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 @@ -5407,7 +6455,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB99_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 @@ -5418,7 +6466,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB99_1 +; GCN1-NEXT: s_cbranch_execnz .LBB121_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5430,7 +6478,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB99_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 @@ -5441,7 +6489,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB99_1 +; GCN2-NEXT: s_cbranch_execnz .LBB121_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5453,7 +6501,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB99_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 @@ -5464,7 +6512,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB99_1 +; GCN3-NEXT: s_cbranch_execnz .LBB121_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5482,7 +6530,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 @@ -5493,7 +6541,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB100_1 +; GCN1-NEXT: s_cbranch_execnz .LBB122_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5507,7 +6555,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 @@ -5518,7 +6566,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB100_1 +; GCN2-NEXT: s_cbranch_execnz .LBB122_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5530,7 +6578,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB100_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 @@ -5541,7 +6589,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB100_1 +; GCN3-NEXT: s_cbranch_execnz .LBB122_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5560,7 +6608,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 -; GCN1-NEXT: .LBB101_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -5571,7 +6619,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB101_1 +; GCN1-NEXT: s_cbranch_execnz .LBB123_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5585,7 +6633,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 -; GCN2-NEXT: .LBB101_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -5596,7 +6644,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB101_1 +; GCN2-NEXT: s_cbranch_execnz .LBB123_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5610,7 +6658,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -5621,7 +6669,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB101_1 +; GCN3-NEXT: s_cbranch_execnz .LBB123_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5639,7 +6687,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: v_mov_b32_e32 v2, s35 ; GCN1-NEXT: flat_load_dword v0, v[1:2] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB102_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -5650,7 +6698,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB102_1 +; GCN1-NEXT: s_cbranch_execnz .LBB124_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5664,7 +6712,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: v_mov_b32_e32 v2, s35 ; GCN2-NEXT: flat_load_dword v0, v[1:2] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -5675,7 +6723,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB102_1 +; GCN2-NEXT: s_cbranch_execnz .LBB124_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5689,7 +6737,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -5700,7 +6748,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB102_1 +; GCN3-NEXT: s_cbranch_execnz .LBB124_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5725,7 +6773,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB103_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_i32_e32 v2, s2, v3 @@ -5736,7 +6784,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB103_1 +; GCN1-NEXT: s_cbranch_execnz .LBB125_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -5755,7 +6803,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v2, s2, v3 @@ -5766,7 +6814,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB103_1 +; GCN2-NEXT: s_cbranch_execnz .LBB125_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -5783,7 +6831,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 @@ -5794,7 +6842,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB103_1 +; GCN3-NEXT: s_cbranch_execnz .LBB125_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -5821,7 +6869,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v2, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB104_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v2 @@ -5832,7 +6880,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB104_1 +; GCN1-NEXT: s_cbranch_execnz .LBB126_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -5856,7 +6904,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v2 @@ -5867,7 +6915,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB104_1 +; GCN2-NEXT: s_cbranch_execnz .LBB126_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -5889,7 +6937,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 @@ -5900,7 +6948,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB104_1 +; GCN3-NEXT: s_cbranch_execnz .LBB126_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -5925,7 +6973,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_i32_e32 v2, s2, v3 @@ -5936,7 +6984,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB105_1 +; GCN1-NEXT: s_cbranch_execnz .LBB127_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -5949,7 +6997,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v2, s2, v3 @@ -5960,7 +7008,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB105_1 +; GCN2-NEXT: s_cbranch_execnz .LBB127_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -5973,7 +7021,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] -; GCN3-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 @@ -5984,7 +7032,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB105_1 +; GCN3-NEXT: s_cbranch_execnz .LBB127_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -6007,7 +7055,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v2, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v2 @@ -6018,7 +7066,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB106_1 +; GCN1-NEXT: s_cbranch_execnz .LBB128_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -6040,7 +7088,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v2 @@ -6051,7 +7099,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB106_1 +; GCN2-NEXT: s_cbranch_execnz .LBB128_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -6073,7 +7121,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 @@ -6084,7 +7132,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB106_1 +; GCN3-NEXT: s_cbranch_execnz .LBB128_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -6098,6 +7146,151 @@ entry: ret void } +define void @flat_min_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB129_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_min_i32_e32 v3, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB129_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB129_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_min_i32_e32 v3, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB129_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB129_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_min_i32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB129_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB130_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_min_i32_e32 v0, v1, v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB130_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB130_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_min_i32_e32 v0, v1, v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB130_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB130_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_min_i32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB130_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %result = atomicrmw min ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw uinc_wrap ; --------------------------------------------------------------------- @@ -6382,6 +7575,72 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou ret i32 %result } +define void @flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_inc v[0:1], v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_inc v[0:1], v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw udec_wrap ; --------------------------------------------------------------------- @@ -6665,3 +7924,71 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst ret i32 %result } + +define void @flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_dec v[0:1], v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_dec v[0:1], v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll index 7fc4484..4bf3e2f 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -299,6 +299,72 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6 ret i64 %result } +define void @flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + ; --------------------------------------------------------------------- ; atomicrmw xchg f64 ; --------------------------------------------------------------------- @@ -595,6 +661,72 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, ret double %result } +define void @flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, double %in) { +; GCN1-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %out, i64 4 + %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define double @flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, double %in) { +; GCN1-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %out, i64 4 + %result = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret double %result +} + ; --------------------------------------------------------------------- ; atomicrmw add ; --------------------------------------------------------------------- @@ -891,6 +1023,72 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 ret i64 %result } +define void @flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + ; --------------------------------------------------------------------- ; atomicrmw sub ; --------------------------------------------------------------------- @@ -1187,6 +1385,72 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ret i64 %result } +define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + ; --------------------------------------------------------------------- ; atomicrmw and ; --------------------------------------------------------------------- @@ -1483,48 +1747,114 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ret i64 %result } -; --------------------------------------------------------------------- -; atomicrmw nand -; --------------------------------------------------------------------- - -define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { -; GCN1-LABEL: flat_atomic_nand_i64_noret: +define void @flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 -; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v6, v[0:1] -; GCN1-NEXT: flat_load_dword v7, v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB40_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v4, v7, v3 -; GCN1-NEXT: v_and_b32_e32 v8, v6, v2 -; GCN1-NEXT: v_not_b32_e32 v5, v4 -; GCN1-NEXT: v_not_b32_e32 v4, v8 -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN1-NEXT: v_mov_b32_e32 v7, v5 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v6, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB40_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; -; GCN2-LABEL: flat_atomic_nand_i64_noret: +; GCN2-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v6, v[0:1] -; GCN2-NEXT: flat_load_dword v7, v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB40_1: ; %atomicrmw.start +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw nand +; --------------------------------------------------------------------- + +define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { +; GCN1-LABEL: flat_atomic_nand_i64_noret: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB50_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v4, v7, v3 +; GCN1-NEXT: v_and_b32_e32 v8, v6, v2 +; GCN1-NEXT: v_not_b32_e32 v5, v4 +; GCN1-NEXT: v_not_b32_e32 v4, v8 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB50_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_nand_i64_noret: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB50_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v4, v7, v3 @@ -1539,7 +1869,7 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB40_1 +; GCN2-NEXT: s_cbranch_execnz .LBB50_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -1549,7 +1879,7 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB40_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB50_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v4, v7, v3 @@ -1564,7 +1894,7 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB40_1 +; GCN3-NEXT: s_cbranch_execnz .LBB50_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -1583,7 +1913,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[0:1] ; GCN1-NEXT: flat_load_dword v6, v[8:9] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB41_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB51_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v0, v7, v3 @@ -1598,7 +1928,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB41_1 +; GCN1-NEXT: s_cbranch_execnz .LBB51_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -1613,7 +1943,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[0:1] ; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB41_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB51_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v0, v7, v3 @@ -1628,7 +1958,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB41_1 +; GCN2-NEXT: s_cbranch_execnz .LBB51_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -1638,7 +1968,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB41_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB51_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v4, v7, v3 @@ -1653,7 +1983,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB41_1 +; GCN3-NEXT: s_cbranch_execnz .LBB51_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -1671,7 +2001,7 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: flat_load_dword v5, v[5:6] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB42_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB52_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -1686,7 +2016,7 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB42_1 +; GCN1-NEXT: s_cbranch_execnz .LBB52_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 @@ -1701,7 +2031,7 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: flat_load_dword v5, v[5:6] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB42_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB52_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -1716,7 +2046,7 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB42_1 +; GCN2-NEXT: s_cbranch_execnz .LBB52_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 @@ -1728,7 +2058,7 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB42_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB52_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -1743,7 +2073,7 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB42_1 +; GCN3-NEXT: s_cbranch_execnz .LBB52_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 @@ -1764,7 +2094,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB43_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB53_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -1779,7 +2109,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB43_1 +; GCN1-NEXT: s_cbranch_execnz .LBB53_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -1794,7 +2124,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB43_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB53_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -1809,7 +2139,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB43_1 +; GCN2-NEXT: s_cbranch_execnz .LBB53_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -1819,7 +2149,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB43_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB53_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -1834,7 +2164,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB43_1 +; GCN3-NEXT: s_cbranch_execnz .LBB53_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 @@ -1860,7 +2190,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: v_mov_b32_e32 v4, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: .LBB44_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB54_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 @@ -1875,7 +2205,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB44_1 +; GCN1-NEXT: s_cbranch_execnz .LBB54_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -1894,7 +2224,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: v_mov_b32_e32 v4, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: .LBB44_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB54_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 @@ -1909,7 +2239,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB44_1 +; GCN2-NEXT: s_cbranch_execnz .LBB54_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -1923,7 +2253,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: v_mov_b32_e32 v4, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB44_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB54_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 @@ -1938,7 +2268,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB44_1 +; GCN3-NEXT: s_cbranch_execnz .LBB54_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -1961,7 +2291,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB45_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 @@ -1976,7 +2306,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB45_1 +; GCN1-NEXT: s_cbranch_execnz .LBB55_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -1995,7 +2325,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB45_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 @@ -2010,7 +2340,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB45_1 +; GCN2-NEXT: s_cbranch_execnz .LBB55_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -2024,7 +2354,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v4, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB45_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 @@ -2039,7 +2369,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB45_1 +; GCN3-NEXT: s_cbranch_execnz .LBB55_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -2063,7 +2393,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: .LBB46_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v1 @@ -2078,7 +2408,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB46_1 +; GCN1-NEXT: s_cbranch_execnz .LBB56_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -2097,7 +2427,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: .LBB46_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v1 @@ -2112,7 +2442,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB46_1 +; GCN2-NEXT: s_cbranch_execnz .LBB56_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -2126,7 +2456,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB46_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v1 @@ -2141,7 +2471,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB46_1 +; GCN3-NEXT: s_cbranch_execnz .LBB56_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -2164,7 +2494,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB47_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v1 @@ -2179,7 +2509,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB47_1 +; GCN1-NEXT: s_cbranch_execnz .LBB57_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -2198,7 +2528,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB47_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v1 @@ -2213,7 +2543,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB47_1 +; GCN2-NEXT: s_cbranch_execnz .LBB57_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -2227,7 +2557,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB47_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v1 @@ -2242,7 +2572,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB47_1 +; GCN3-NEXT: s_cbranch_execnz .LBB57_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -2251,6 +2581,188 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ret i64 %result } +define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[0:1] +; GCN1-NEXT: flat_load_dword v6, v[8:9] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v0, v7, v3 +; GCN1-NEXT: v_and_b32_e32 v1, v6, v2 +; GCN1-NEXT: v_not_b32_e32 v5, v0 +; GCN1-NEXT: v_not_b32_e32 v4, v1 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB58_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[0:1] +; GCN2-NEXT: flat_load_dword v6, v[8:9] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v0, v7, v3 +; GCN2-NEXT: v_and_b32_e32 v1, v6, v2 +; GCN2-NEXT: v_not_b32_e32 v5, v0 +; GCN2-NEXT: v_not_b32_e32 v4, v1 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB58_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v4, v7, v3 +; GCN3-NEXT: v_and_b32_e32 v8, v6, v2 +; GCN3-NEXT: v_not_b32_e32 v5, v4 +; GCN3-NEXT: v_not_b32_e32 v4, v8 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB58_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_and_b32_e32 v0, v9, v3 +; GCN1-NEXT: v_and_b32_e32 v1, v8, v2 +; GCN1-NEXT: v_not_b32_e32 v7, v0 +; GCN1-NEXT: v_not_b32_e32 v6, v1 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB59_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_and_b32_e32 v0, v9, v3 +; GCN2-NEXT: v_and_b32_e32 v1, v8, v2 +; GCN2-NEXT: v_not_b32_e32 v7, v0 +; GCN2-NEXT: v_not_b32_e32 v6, v1 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB59_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_and_b32_e32 v4, v7, v3 +; GCN3-NEXT: v_and_b32_e32 v8, v6, v2 +; GCN3-NEXT: v_not_b32_e32 v5, v4 +; GCN3-NEXT: v_not_b32_e32 v4, v8 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB59_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + ; --------------------------------------------------------------------- ; atomicrmw or ; --------------------------------------------------------------------- @@ -2547,23 +3059,89 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ret i64 %result } -; --------------------------------------------------------------------- -; atomicrmw xor -; --------------------------------------------------------------------- - -define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) { -; GCN1-LABEL: flat_atomic_xor_i64_noret: +define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_setpc_b64 s[30:31] ; -; GCN2-LABEL: flat_atomic_xor_i64_noret: +; GCN2-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw xor +; --------------------------------------------------------------------- + +define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) { +; GCN1-LABEL: flat_atomic_xor_i64_noret: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_xor_i64_noret: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -2843,6 +3421,72 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ret i64 %result } +define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + ; --------------------------------------------------------------------- ; atomicrmw max ; --------------------------------------------------------------------- @@ -2856,7 +3500,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB64_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB80_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -2870,7 +3514,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v4 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB64_1 +; GCN1-NEXT: s_cbranch_execnz .LBB80_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -2883,7 +3527,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB64_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB80_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -2897,7 +3541,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB64_1 +; GCN2-NEXT: s_cbranch_execnz .LBB80_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -2907,7 +3551,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB64_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB80_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -2921,7 +3565,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB64_1 +; GCN3-NEXT: s_cbranch_execnz .LBB80_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -2940,7 +3584,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[0:1] ; GCN1-NEXT: flat_load_dword v6, v[8:9] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB65_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB81_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -2954,7 +3598,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB65_1 +; GCN1-NEXT: s_cbranch_execnz .LBB81_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -2969,7 +3613,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[0:1] ; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB65_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB81_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -2983,7 +3627,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB65_1 +; GCN2-NEXT: s_cbranch_execnz .LBB81_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -2993,7 +3637,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB65_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB81_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -3007,7 +3651,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB65_1 +; GCN3-NEXT: s_cbranch_execnz .LBB81_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -3025,7 +3669,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: flat_load_dword v5, v[5:6] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB66_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB82_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -3039,7 +3683,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB66_1 +; GCN1-NEXT: s_cbranch_execnz .LBB82_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 @@ -3054,7 +3698,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: flat_load_dword v5, v[5:6] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB66_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB82_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -3068,7 +3712,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB66_1 +; GCN2-NEXT: s_cbranch_execnz .LBB82_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 @@ -3080,7 +3724,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB66_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB82_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -3094,7 +3738,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB66_1 +; GCN3-NEXT: s_cbranch_execnz .LBB82_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 @@ -3115,7 +3759,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB67_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB83_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -3129,7 +3773,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB67_1 +; GCN1-NEXT: s_cbranch_execnz .LBB83_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3144,7 +3788,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB67_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB83_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -3158,7 +3802,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB67_1 +; GCN2-NEXT: s_cbranch_execnz .LBB83_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3168,7 +3812,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB67_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB83_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -3182,7 +3826,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB67_1 +; GCN3-NEXT: s_cbranch_execnz .LBB83_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 @@ -3210,7 +3854,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: .LBB68_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -3224,7 +3868,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB68_1 +; GCN1-NEXT: s_cbranch_execnz .LBB84_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3245,7 +3889,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: .LBB68_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -3259,7 +3903,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB68_1 +; GCN2-NEXT: s_cbranch_execnz .LBB84_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3275,7 +3919,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB68_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -3289,7 +3933,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB68_1 +; GCN3-NEXT: s_cbranch_execnz .LBB84_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -3314,7 +3958,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 -; GCN1-NEXT: .LBB69_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -3328,7 +3972,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB69_1 +; GCN1-NEXT: s_cbranch_execnz .LBB85_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3349,7 +3993,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 -; GCN2-NEXT: .LBB69_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -3363,7 +4007,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB69_1 +; GCN2-NEXT: s_cbranch_execnz .LBB85_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3379,7 +4023,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB69_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -3393,7 +4037,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB69_1 +; GCN3-NEXT: s_cbranch_execnz .LBB85_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -3419,7 +4063,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: .LBB70_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -3433,7 +4077,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB70_1 +; GCN1-NEXT: s_cbranch_execnz .LBB86_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3454,7 +4098,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: .LBB70_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -3468,7 +4112,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB70_1 +; GCN2-NEXT: s_cbranch_execnz .LBB86_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3484,7 +4128,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB70_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -3498,7 +4142,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB70_1 +; GCN3-NEXT: s_cbranch_execnz .LBB86_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -3523,7 +4167,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: .LBB71_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -3537,7 +4181,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB71_1 +; GCN1-NEXT: s_cbranch_execnz .LBB87_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3558,7 +4202,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: .LBB71_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -3572,7 +4216,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB71_1 +; GCN2-NEXT: s_cbranch_execnz .LBB87_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3588,7 +4232,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB71_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -3602,7 +4246,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB71_1 +; GCN3-NEXT: s_cbranch_execnz .LBB87_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -3628,7 +4272,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 -; GCN1-NEXT: .LBB72_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -3642,7 +4286,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB72_1 +; GCN1-NEXT: s_cbranch_execnz .LBB88_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -3662,7 +4306,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 -; GCN2-NEXT: .LBB72_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -3676,7 +4320,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB72_1 +; GCN2-NEXT: s_cbranch_execnz .LBB88_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -3694,7 +4338,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: .LBB72_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -3708,7 +4352,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB72_1 +; GCN3-NEXT: s_cbranch_execnz .LBB88_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -3734,7 +4378,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB73_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v3 @@ -3748,7 +4392,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB73_1 +; GCN1-NEXT: s_cbranch_execnz .LBB89_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -3771,7 +4415,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB73_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v3 @@ -3785,7 +4429,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB73_1 +; GCN2-NEXT: s_cbranch_execnz .LBB89_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -3806,7 +4450,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s5 ; GCN3-NEXT: v_mov_b32_e32 v5, s4 -; GCN3-NEXT: .LBB73_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 @@ -3820,7 +4464,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB73_1 +; GCN3-NEXT: s_cbranch_execnz .LBB89_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s2 @@ -3850,7 +4494,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 -; GCN1-NEXT: .LBB74_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -3864,7 +4508,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB74_1 +; GCN1-NEXT: s_cbranch_execnz .LBB90_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -3882,7 +4526,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 -; GCN2-NEXT: .LBB74_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -3896,7 +4540,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB74_1 +; GCN2-NEXT: s_cbranch_execnz .LBB90_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -3914,7 +4558,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: .LBB74_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -3928,7 +4572,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB74_1 +; GCN3-NEXT: s_cbranch_execnz .LBB90_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -3951,7 +4595,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB75_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v3 @@ -3965,7 +4609,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB75_1 +; GCN1-NEXT: s_cbranch_execnz .LBB91_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -3986,7 +4630,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB75_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v3 @@ -4000,7 +4644,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB75_1 +; GCN2-NEXT: s_cbranch_execnz .LBB91_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -4021,7 +4665,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s5 ; GCN3-NEXT: v_mov_b32_e32 v5, s4 -; GCN3-NEXT: .LBB75_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 @@ -4035,7 +4679,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB75_1 +; GCN3-NEXT: s_cbranch_execnz .LBB91_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s2 @@ -4049,77 +4693,77 @@ entry: ret void } -; --------------------------------------------------------------------- -; atomicrmw umax -; --------------------------------------------------------------------- - -define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { -; GCN1-LABEL: flat_atomic_umax_i64_noret: +define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 -; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v6, v[0:1] -; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[0:1] +; GCN1-NEXT: flat_load_dword v6, v[8:9] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB76_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB92_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB76_1 +; GCN1-NEXT: s_cbranch_execnz .LBB92_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; -; GCN2-LABEL: flat_atomic_umax_i64_noret: +; GCN2-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v6, v[0:1] -; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[0:1] +; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB76_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB92_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB76_1 +; GCN2-NEXT: s_cbranch_execnz .LBB92_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; -; GCN3-LABEL: flat_atomic_umax_i64_noret: +; GCN3-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB76_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB92_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -4127,26 +4771,202 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB76_1 +; GCN3-NEXT: s_cbranch_execnz .LBB92_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 ret void } -define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { -; GCN1-LABEL: flat_atomic_umax_i64_noret_offset: +define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB93_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB93_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB93_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB93_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB93_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB93_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw umax +; --------------------------------------------------------------------- + +define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { +; GCN1-LABEL: flat_atomic_umax_i64_noret: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB94_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_umax_i64_noret: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB94_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_umax_i64_noret: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB94_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst + ret void +} + +define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_umax_i64_noret_offset: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v7, v[0:1] ; GCN1-NEXT: flat_load_dword v6, v[8:9] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB77_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB95_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -4160,7 +4980,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB77_1 +; GCN1-NEXT: s_cbranch_execnz .LBB95_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4175,7 +4995,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[0:1] ; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB77_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB95_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -4189,7 +5009,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB77_1 +; GCN2-NEXT: s_cbranch_execnz .LBB95_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4199,7 +5019,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB77_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB95_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -4213,7 +5033,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB77_1 +; GCN3-NEXT: s_cbranch_execnz .LBB95_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -4231,7 +5051,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: flat_load_dword v5, v[5:6] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB78_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB96_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -4245,7 +5065,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB78_1 +; GCN1-NEXT: s_cbranch_execnz .LBB96_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 @@ -4260,7 +5080,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: flat_load_dword v5, v[5:6] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB78_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB96_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -4274,7 +5094,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB78_1 +; GCN2-NEXT: s_cbranch_execnz .LBB96_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 @@ -4286,7 +5106,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB78_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB96_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -4300,7 +5120,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB78_1 +; GCN3-NEXT: s_cbranch_execnz .LBB96_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 @@ -4321,7 +5141,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB79_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB97_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -4335,7 +5155,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB79_1 +; GCN1-NEXT: s_cbranch_execnz .LBB97_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4350,7 +5170,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB79_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB97_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -4364,7 +5184,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB79_1 +; GCN2-NEXT: s_cbranch_execnz .LBB97_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4374,7 +5194,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB79_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB97_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -4388,7 +5208,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB79_1 +; GCN3-NEXT: s_cbranch_execnz .LBB97_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 @@ -4416,7 +5236,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: .LBB80_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -4430,7 +5250,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB80_1 +; GCN1-NEXT: s_cbranch_execnz .LBB98_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4451,7 +5271,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: .LBB80_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -4465,7 +5285,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB80_1 +; GCN2-NEXT: s_cbranch_execnz .LBB98_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4481,7 +5301,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB80_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -4495,7 +5315,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB80_1 +; GCN3-NEXT: s_cbranch_execnz .LBB98_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -4520,7 +5340,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 -; GCN1-NEXT: .LBB81_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -4534,7 +5354,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB81_1 +; GCN1-NEXT: s_cbranch_execnz .LBB99_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4555,7 +5375,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 -; GCN2-NEXT: .LBB81_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -4569,7 +5389,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB81_1 +; GCN2-NEXT: s_cbranch_execnz .LBB99_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4585,7 +5405,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB81_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -4599,7 +5419,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB81_1 +; GCN3-NEXT: s_cbranch_execnz .LBB99_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -4625,7 +5445,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: .LBB82_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -4639,7 +5459,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB82_1 +; GCN1-NEXT: s_cbranch_execnz .LBB100_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4660,7 +5480,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: .LBB82_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -4674,7 +5494,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB82_1 +; GCN2-NEXT: s_cbranch_execnz .LBB100_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4690,7 +5510,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB82_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -4704,7 +5524,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB82_1 +; GCN3-NEXT: s_cbranch_execnz .LBB100_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -4729,7 +5549,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: .LBB83_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -4743,7 +5563,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB83_1 +; GCN1-NEXT: s_cbranch_execnz .LBB101_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4764,7 +5584,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: .LBB83_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -4778,7 +5598,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB83_1 +; GCN2-NEXT: s_cbranch_execnz .LBB101_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4794,7 +5614,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB83_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -4808,7 +5628,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB83_1 +; GCN3-NEXT: s_cbranch_execnz .LBB101_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -4834,7 +5654,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 -; GCN1-NEXT: .LBB84_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] @@ -4848,7 +5668,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB84_1 +; GCN1-NEXT: s_cbranch_execnz .LBB102_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -4868,7 +5688,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 -; GCN2-NEXT: .LBB84_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] @@ -4882,7 +5702,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB84_1 +; GCN2-NEXT: s_cbranch_execnz .LBB102_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -4900,7 +5720,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: .LBB84_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -4914,7 +5734,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB84_1 +; GCN3-NEXT: s_cbranch_execnz .LBB102_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -4940,7 +5760,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB85_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v3 @@ -4954,7 +5774,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB85_1 +; GCN1-NEXT: s_cbranch_execnz .LBB103_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -4977,7 +5797,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB85_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v3 @@ -4991,7 +5811,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB85_1 +; GCN2-NEXT: s_cbranch_execnz .LBB103_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -5012,7 +5832,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s5 ; GCN3-NEXT: v_mov_b32_e32 v5, s4 -; GCN3-NEXT: .LBB85_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 @@ -5026,7 +5846,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB85_1 +; GCN3-NEXT: s_cbranch_execnz .LBB103_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s2 @@ -5055,7 +5875,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v3 @@ -5069,7 +5889,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB86_1 +; GCN1-NEXT: s_cbranch_execnz .LBB104_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -5090,7 +5910,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB86_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v3 @@ -5104,7 +5924,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB86_1 +; GCN2-NEXT: s_cbranch_execnz .LBB104_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -5125,7 +5945,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s5 ; GCN3-NEXT: v_mov_b32_e32 v5, s4 -; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 @@ -5139,7 +5959,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB86_1 +; GCN3-NEXT: s_cbranch_execnz .LBB104_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s2 @@ -5153,6 +5973,182 @@ entry: ret void } +define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[0:1] +; GCN1-NEXT: flat_load_dword v6, v[8:9] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB105_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[0:1] +; GCN2-NEXT: flat_load_dword v6, v[8:9] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB105_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB105_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB106_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB106_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB106_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + ; --------------------------------------------------------------------- ; atomicrmw umin ; --------------------------------------------------------------------- @@ -5166,7 +6162,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB87_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB107_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -5180,7 +6176,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v4 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB87_1 +; GCN1-NEXT: s_cbranch_execnz .LBB107_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5193,7 +6189,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB87_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB107_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -5207,7 +6203,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB87_1 +; GCN2-NEXT: s_cbranch_execnz .LBB107_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5217,7 +6213,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB87_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB107_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -5231,7 +6227,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB87_1 +; GCN3-NEXT: s_cbranch_execnz .LBB107_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5250,7 +6246,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[0:1] ; GCN1-NEXT: flat_load_dword v6, v[8:9] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB88_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB108_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -5264,7 +6260,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB88_1 +; GCN1-NEXT: s_cbranch_execnz .LBB108_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5279,7 +6275,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[0:1] ; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB108_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -5293,7 +6289,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB88_1 +; GCN2-NEXT: s_cbranch_execnz .LBB108_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5303,7 +6299,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB108_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -5317,7 +6313,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB88_1 +; GCN3-NEXT: s_cbranch_execnz .LBB108_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5335,7 +6331,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: flat_load_dword v5, v[5:6] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB89_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB109_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -5349,7 +6345,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB89_1 +; GCN1-NEXT: s_cbranch_execnz .LBB109_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 @@ -5364,7 +6360,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: flat_load_dword v5, v[5:6] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB109_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -5378,7 +6374,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB89_1 +; GCN2-NEXT: s_cbranch_execnz .LBB109_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 @@ -5390,7 +6386,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB109_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -5404,7 +6400,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB89_1 +; GCN3-NEXT: s_cbranch_execnz .LBB109_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 @@ -5425,7 +6421,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB90_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB110_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -5439,7 +6435,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB90_1 +; GCN1-NEXT: s_cbranch_execnz .LBB110_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5454,7 +6450,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB110_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -5468,7 +6464,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB90_1 +; GCN2-NEXT: s_cbranch_execnz .LBB110_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5478,7 +6474,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB110_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -5492,7 +6488,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB90_1 +; GCN3-NEXT: s_cbranch_execnz .LBB110_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 @@ -5520,7 +6516,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: .LBB91_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -5534,7 +6530,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB91_1 +; GCN1-NEXT: s_cbranch_execnz .LBB111_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5555,7 +6551,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -5569,7 +6565,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB91_1 +; GCN2-NEXT: s_cbranch_execnz .LBB111_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5585,7 +6581,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -5599,7 +6595,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB91_1 +; GCN3-NEXT: s_cbranch_execnz .LBB111_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5624,7 +6620,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 -; GCN1-NEXT: .LBB92_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -5638,7 +6634,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB92_1 +; GCN1-NEXT: s_cbranch_execnz .LBB112_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5659,7 +6655,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 -; GCN2-NEXT: .LBB92_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -5673,7 +6669,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB92_1 +; GCN2-NEXT: s_cbranch_execnz .LBB112_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5689,7 +6685,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB92_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -5703,7 +6699,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB92_1 +; GCN3-NEXT: s_cbranch_execnz .LBB112_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5729,7 +6725,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: .LBB93_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -5743,7 +6739,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB93_1 +; GCN1-NEXT: s_cbranch_execnz .LBB113_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5764,7 +6760,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: .LBB93_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -5778,7 +6774,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB93_1 +; GCN2-NEXT: s_cbranch_execnz .LBB113_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5794,7 +6790,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB93_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -5808,7 +6804,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB93_1 +; GCN3-NEXT: s_cbranch_execnz .LBB113_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5833,7 +6829,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -5847,7 +6843,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB94_1 +; GCN1-NEXT: s_cbranch_execnz .LBB114_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5868,7 +6864,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -5882,7 +6878,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB94_1 +; GCN2-NEXT: s_cbranch_execnz .LBB114_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5898,7 +6894,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -5912,7 +6908,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB94_1 +; GCN3-NEXT: s_cbranch_execnz .LBB114_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5921,34 +6917,210 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ret i64 %result } -; --------------------------------------------------------------------- -; atomicrmw min -; --------------------------------------------------------------------- - -define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { -; GCN1-LABEL: flat_atomic_min_i64_noret: +define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 -; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v6, v[0:1] -; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[0:1] +; GCN1-NEXT: flat_load_dword v6, v[8:9] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB95_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB115_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB115_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[0:1] +; GCN2-NEXT: flat_load_dword v6, v[8:9] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB115_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB115_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB116_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB116_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB116_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw min +; --------------------------------------------------------------------- + +define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { +; GCN1-LABEL: flat_atomic_min_i64_noret: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB117_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v5 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v4 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB95_1 +; GCN1-NEXT: s_cbranch_execnz .LBB117_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5961,7 +7133,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB95_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB117_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -5975,7 +7147,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB95_1 +; GCN2-NEXT: s_cbranch_execnz .LBB117_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5985,7 +7157,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB95_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB117_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -5999,7 +7171,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB95_1 +; GCN3-NEXT: s_cbranch_execnz .LBB117_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -6018,7 +7190,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[0:1] ; GCN1-NEXT: flat_load_dword v6, v[8:9] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB96_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB118_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -6032,7 +7204,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB96_1 +; GCN1-NEXT: s_cbranch_execnz .LBB118_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6047,7 +7219,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[0:1] ; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB96_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB118_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -6061,7 +7233,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB96_1 +; GCN2-NEXT: s_cbranch_execnz .LBB118_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6071,7 +7243,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB96_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB118_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -6085,7 +7257,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB96_1 +; GCN3-NEXT: s_cbranch_execnz .LBB118_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -6103,7 +7275,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: flat_load_dword v5, v[5:6] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB97_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB119_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -6117,7 +7289,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB97_1 +; GCN1-NEXT: s_cbranch_execnz .LBB119_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 @@ -6132,7 +7304,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: flat_load_dword v5, v[5:6] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB97_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB119_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -6146,7 +7318,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB97_1 +; GCN2-NEXT: s_cbranch_execnz .LBB119_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 @@ -6158,7 +7330,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB97_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB119_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -6172,7 +7344,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB97_1 +; GCN3-NEXT: s_cbranch_execnz .LBB119_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 @@ -6193,7 +7365,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB98_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB120_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -6207,7 +7379,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB98_1 +; GCN1-NEXT: s_cbranch_execnz .LBB120_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6222,7 +7394,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB98_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB120_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -6236,7 +7408,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB98_1 +; GCN2-NEXT: s_cbranch_execnz .LBB120_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6246,7 +7418,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB98_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB120_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -6260,7 +7432,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB98_1 +; GCN3-NEXT: s_cbranch_execnz .LBB120_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 @@ -6288,7 +7460,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: .LBB99_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -6302,7 +7474,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB99_1 +; GCN1-NEXT: s_cbranch_execnz .LBB121_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6323,7 +7495,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: .LBB99_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -6337,7 +7509,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB99_1 +; GCN2-NEXT: s_cbranch_execnz .LBB121_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6353,7 +7525,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB99_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -6367,7 +7539,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB99_1 +; GCN3-NEXT: s_cbranch_execnz .LBB121_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -6392,7 +7564,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 -; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -6406,7 +7578,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB100_1 +; GCN1-NEXT: s_cbranch_execnz .LBB122_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6427,7 +7599,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 -; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -6441,7 +7613,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB100_1 +; GCN2-NEXT: s_cbranch_execnz .LBB122_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6457,7 +7629,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB100_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -6471,7 +7643,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB100_1 +; GCN3-NEXT: s_cbranch_execnz .LBB122_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -6497,7 +7669,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: .LBB101_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -6511,7 +7683,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB101_1 +; GCN1-NEXT: s_cbranch_execnz .LBB123_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6532,7 +7704,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: .LBB101_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -6546,7 +7718,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB101_1 +; GCN2-NEXT: s_cbranch_execnz .LBB123_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6562,7 +7734,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -6576,7 +7748,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB101_1 +; GCN3-NEXT: s_cbranch_execnz .LBB123_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -6601,7 +7773,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: .LBB102_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -6615,7 +7787,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB102_1 +; GCN1-NEXT: s_cbranch_execnz .LBB124_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6636,7 +7808,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -6650,7 +7822,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB102_1 +; GCN2-NEXT: s_cbranch_execnz .LBB124_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6666,7 +7838,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -6680,7 +7852,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB102_1 +; GCN3-NEXT: s_cbranch_execnz .LBB124_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -6706,7 +7878,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 -; GCN1-NEXT: .LBB103_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -6720,7 +7892,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB103_1 +; GCN1-NEXT: s_cbranch_execnz .LBB125_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -6740,7 +7912,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 -; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -6754,7 +7926,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB103_1 +; GCN2-NEXT: s_cbranch_execnz .LBB125_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -6772,7 +7944,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -6786,7 +7958,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB103_1 +; GCN3-NEXT: s_cbranch_execnz .LBB125_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -6812,7 +7984,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB104_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v3 @@ -6826,7 +7998,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB104_1 +; GCN1-NEXT: s_cbranch_execnz .LBB126_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -6849,7 +8021,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v3 @@ -6863,7 +8035,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB104_1 +; GCN2-NEXT: s_cbranch_execnz .LBB126_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -6884,7 +8056,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s5 ; GCN3-NEXT: v_mov_b32_e32 v5, s4 -; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 @@ -6898,7 +8070,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB104_1 +; GCN3-NEXT: s_cbranch_execnz .LBB126_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s2 @@ -6926,7 +8098,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -6940,7 +8112,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB105_1 +; GCN1-NEXT: s_cbranch_execnz .LBB127_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -6956,7 +8128,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -6970,7 +8142,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB105_1 +; GCN2-NEXT: s_cbranch_execnz .LBB127_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -6986,7 +8158,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN3-NEXT: v_mov_b32_e32 v6, s3 ; GCN3-NEXT: v_mov_b32_e32 v7, s2 ; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -7000,7 +8172,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB105_1 +; GCN3-NEXT: s_cbranch_execnz .LBB127_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -7022,7 +8194,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v3 @@ -7036,7 +8208,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB106_1 +; GCN1-NEXT: s_cbranch_execnz .LBB128_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -7057,7 +8229,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v3 @@ -7071,7 +8243,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB106_1 +; GCN2-NEXT: s_cbranch_execnz .LBB128_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -7092,7 +8264,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s5 ; GCN3-NEXT: v_mov_b32_e32 v5, s4 -; GCN3-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 @@ -7106,7 +8278,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB106_1 +; GCN3-NEXT: s_cbranch_execnz .LBB128_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s2 @@ -7120,6 +8292,182 @@ entry: ret void } +define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[0:1] +; GCN1-NEXT: flat_load_dword v6, v[8:9] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB129_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB129_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[0:1] +; GCN2-NEXT: flat_load_dword v6, v[8:9] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB129_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB129_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB129_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB129_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB130_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB130_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB130_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB130_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB130_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB130_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + ; --------------------------------------------------------------------- ; atomicrmw uinc_wrap ; --------------------------------------------------------------------- @@ -7416,6 +8764,72 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ret i64 %result } +define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + ; --------------------------------------------------------------------- ; atomicrmw udec_wrap ; --------------------------------------------------------------------- @@ -7711,3 +9125,71 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst ret i64 %result } + +define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll index 5889de7..d10e049 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -1933,6 +1933,646 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ret void } +define <2 x half> @global_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) { +; GFX900-LABEL: global_atomic_fadd_ret_v2f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v3, v[0:1], off +; GFX900-NEXT: s_mov_b64 s[4:5], 0 +; GFX900-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX900-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_cbranch_execnz .LBB12_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_atomic_fadd_ret_v2f16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_atomic_fadd_ret_v2f16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_atomic_fadd_ret_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_atomic_fadd_ret_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst + ret <2 x half> %result +} + +define void @global_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) { +; GFX900-LABEL: global_atomic_fadd_noret_v2f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v4, v[0:1], off +; GFX900-NEXT: s_mov_b64 s[4:5], 0 +; GFX900-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX900-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_cbranch_execnz .LBB13_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_atomic_fadd_noret_v2f16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_atomic_fadd_noret_v2f16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_atomic_fadd_noret_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_atomic_fadd_noret_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst + ret void +} + +define <2 x bfloat> @global_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) { +; GFX900-LABEL: global_atomic_fadd_ret_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v3, v[0:1], off +; GFX900-NEXT: s_mov_b64 s[6:7], 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX900-NEXT: s_movk_i32 s8, 0x7fff +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: s_mov_b32 s9, 0x7060302 +; GFX900-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX900-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX900-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX900-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX900-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX900-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX900-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX900-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX900-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX900-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX900-NEXT: s_cbranch_execnz .LBB14_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_atomic_fadd_ret_v2bf16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_atomic_fadd_ret_v2bf16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_atomic_fadd_ret_v2bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_atomic_fadd_ret_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + ret <2 x bfloat> %result +} + +define void @global_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) { +; GFX900-LABEL: global_atomic_fadd_noret_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v3, v[0:1], off +; GFX900-NEXT: s_mov_b64 s[6:7], 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX900-NEXT: s_movk_i32 s8, 0x7fff +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX900-NEXT: s_mov_b32 s9, 0x7060302 +; GFX900-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX900-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX900-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX900-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX900-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX900-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX900-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX900-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX900-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX900-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX900-NEXT: s_cbranch_execnz .LBB15_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_atomic_fadd_noret_v2bf16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_atomic_fadd_noret_v2bf16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_atomic_fadd_noret_v2bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_atomic_fadd_noret_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + ret void +} + attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "target-cpu"="gfx803" "target-features"="+atomic-fadd-no-rtn-insts" "amdgpu-unsafe-fp-atomics"="true" } attributes #2 = { "amdgpu-unsafe-fp-atomics"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index 4598d23..4ec3ac2 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -353,6 +353,79 @@ define amdgpu_gfx i32 @global_atomic_xchg_i32_ret_offset_scalar(ptr addrspace(1) ret i32 %result } +define void @global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_swap v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_swap v[0:1], v2, off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw xchg f32 ; --------------------------------------------------------------------- @@ -703,6 +776,79 @@ define amdgpu_gfx float @global_atomic_xchg_f32_ret_offset_scalar(ptr addrspace( ret float %result } +define void @global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, float %in) { +; SI-LABEL: global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_swap v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_swap v[0:1], v2, off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define float @global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, float %in) { +; SI-LABEL: global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret float %result +} + ; --------------------------------------------------------------------- ; atomicrmw add ; --------------------------------------------------------------------- @@ -1053,6 +1199,79 @@ define amdgpu_gfx i32 @global_atomic_add_i32_ret_offset_scalar(ptr addrspace(1) ret i32 %result } +define void @global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_add v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_add v[0:1], v2, off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_add v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw sub ; --------------------------------------------------------------------- @@ -1440,6 +1659,79 @@ define i32 @global_atomic_sub_0_i32_ret(ptr addrspace(1) %ptr) { ret i32 %result } +define void @global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_sub v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_sub v[0:1], v2, off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_sub v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw and ; --------------------------------------------------------------------- @@ -1790,25 +2082,98 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_offset_scalar(ptr addrspace(1) ret i32 %result } -; --------------------------------------------------------------------- -; atomicrmw nand -; --------------------------------------------------------------------- - -define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) { -; SI-LABEL: global_atomic_nand_i32_noret: +define void @global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB41_1: ; %atomicrmw.start -; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, v4, v2 -; SI-NEXT: v_not_b32_e32 v3, v3 +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_and v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_and v[0:1], v2, off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_and v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + +; --------------------------------------------------------------------- +; atomicrmw nand +; --------------------------------------------------------------------- + +define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) { +; SI-LABEL: global_atomic_nand_i32_noret: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB51_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, v4, v2 +; SI-NEXT: v_not_b32_e32 v3, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v6, v4 ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -1819,7 +2184,7 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB41_1 +; SI-NEXT: s_cbranch_execnz .LBB51_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -1830,7 +2195,7 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB41_1: ; %atomicrmw.start +; VI-NEXT: .LBB51_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v3, v4, v2 @@ -1842,7 +2207,7 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB41_1 +; VI-NEXT: s_cbranch_execnz .LBB51_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1852,7 +2217,7 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 @@ -1864,7 +2229,7 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB41_1 +; GFX9-NEXT: s_cbranch_execnz .LBB51_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1882,7 +2247,7 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB42_1: ; %atomicrmw.start +; SI-NEXT: .LBB52_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, v4, v2 @@ -1897,7 +2262,7 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB42_1 +; SI-NEXT: s_cbranch_execnz .LBB52_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -1910,7 +2275,7 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB42_1: ; %atomicrmw.start +; VI-NEXT: .LBB52_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v3, v4, v2 @@ -1922,7 +2287,7 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB42_1 +; VI-NEXT: s_cbranch_execnz .LBB52_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1932,7 +2297,7 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 @@ -1944,7 +2309,7 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB42_1 +; GFX9-NEXT: s_cbranch_execnz .LBB52_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1963,7 +2328,7 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB43_1: ; %atomicrmw.start +; SI-NEXT: .LBB53_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -1978,7 +2343,7 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB43_1 +; SI-NEXT: s_cbranch_execnz .LBB53_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 @@ -1990,7 +2355,7 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB43_1: ; %atomicrmw.start +; VI-NEXT: .LBB53_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v3 @@ -2002,7 +2367,7 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB43_1 +; VI-NEXT: s_cbranch_execnz .LBB53_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v3 @@ -2013,7 +2378,7 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -2025,7 +2390,7 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB43_1 +; GFX9-NEXT: s_cbranch_execnz .LBB53_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 @@ -2044,7 +2409,7 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB44_1: ; %atomicrmw.start +; SI-NEXT: .LBB54_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -2059,7 +2424,7 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB44_1 +; SI-NEXT: s_cbranch_execnz .LBB54_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 @@ -2073,7 +2438,7 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[3:4] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB44_1: ; %atomicrmw.start +; VI-NEXT: .LBB54_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, v0 @@ -2085,7 +2450,7 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB44_1 +; VI-NEXT: s_cbranch_execnz .LBB54_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2095,7 +2460,7 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -2107,7 +2472,7 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB44_1 +; GFX9-NEXT: s_cbranch_execnz .LBB54_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 @@ -2132,7 +2497,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB45_1: ; %atomicrmw.start +; SI-NEXT: .LBB55_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, s34, v1 @@ -2147,7 +2512,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB45_1 +; SI-NEXT: s_cbranch_execnz .LBB55_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 @@ -2165,7 +2530,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB45_1: ; %atomicrmw.start +; VI-NEXT: .LBB55_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, s6, v3 @@ -2177,7 +2542,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB45_1 +; VI-NEXT: s_cbranch_execnz .LBB55_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2188,7 +2553,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, s6, v1 @@ -2200,7 +2565,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB45_1 +; GFX9-NEXT: s_cbranch_execnz .LBB55_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2223,7 +2588,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB46_1: ; %atomicrmw.start +; SI-NEXT: .LBB56_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, s34, v1 @@ -2238,7 +2603,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB46_1 +; SI-NEXT: s_cbranch_execnz .LBB56_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 @@ -2258,7 +2623,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: v_mov_b32_e32 v1, s35 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB46_1: ; %atomicrmw.start +; VI-NEXT: .LBB56_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, s6, v3 @@ -2270,7 +2635,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB46_1 +; VI-NEXT: s_cbranch_execnz .LBB56_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2281,7 +2646,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, s6, v1 @@ -2293,7 +2658,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB46_1 +; GFX9-NEXT: s_cbranch_execnz .LBB56_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2317,7 +2682,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB47_1: ; %atomicrmw.start +; SI-NEXT: .LBB57_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 @@ -2332,7 +2697,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB47_1 +; SI-NEXT: s_cbranch_execnz .LBB57_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 @@ -2352,7 +2717,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: .LBB47_1: ; %atomicrmw.start +; VI-NEXT: .LBB57_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v0 @@ -2364,7 +2729,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB47_1 +; VI-NEXT: s_cbranch_execnz .LBB57_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2375,7 +2740,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -2387,7 +2752,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB47_1 +; GFX9-NEXT: s_cbranch_execnz .LBB57_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2410,7 +2775,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB48_1: ; %atomicrmw.start +; SI-NEXT: .LBB58_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 @@ -2425,7 +2790,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB48_1 +; SI-NEXT: s_cbranch_execnz .LBB58_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 @@ -2445,7 +2810,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v2, s35 ; VI-NEXT: flat_load_dword v0, v[1:2] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB48_1: ; %atomicrmw.start +; VI-NEXT: .LBB58_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v0 @@ -2457,7 +2822,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB48_1 +; VI-NEXT: s_cbranch_execnz .LBB58_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2468,7 +2833,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -2480,7 +2845,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB48_1 +; GFX9-NEXT: s_cbranch_execnz .LBB58_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2489,6 +2854,170 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ret i32 %result } +define void @global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB59_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, v4, v2 +; SI-NEXT: v_not_b32_e32 v3, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB59_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB59_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v3, v4, v2 +; VI-NEXT: v_not_b32_e32 v3, v3 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB59_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 +; GFX9-NEXT: v_not_b32_e32 v3, v3 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB59_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB60_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, v5, v2 +; SI-NEXT: v_not_b32_e32 v4, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB60_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB60_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_and_b32_e32 v0, v1, v2 +; VI-NEXT: v_not_b32_e32 v0, v0 +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB60_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 +; GFX9-NEXT: v_not_b32_e32 v3, v3 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB60_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw or ; --------------------------------------------------------------------- @@ -2876,95 +3405,168 @@ define i32 @global_atomic_or_0_i32_ret(ptr addrspace(1) %ptr) { ret i32 %result } -; --------------------------------------------------------------------- -; atomicrmw xor -; --------------------------------------------------------------------- - -define void @global_atomic_xor_i32_noret(ptr addrspace(1) %ptr, i32 %in) { -; SI-LABEL: global_atomic_xor_i32_noret: +define void @global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: global_atomic_xor_i32_noret: +; VI-LABEL: global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor v[0:1], v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_or v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_atomic_xor_i32_noret: +; GFX9-LABEL: global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v[0:1], v2, off +; GFX9-NEXT: global_atomic_or v[0:1], v2, off offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 ret void } -define void @global_atomic_xor_i32_noret_offset(ptr addrspace(1) %out, i32 %in) { -; SI-LABEL: global_atomic_xor_i32_noret_offset: +define i32 @global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: global_atomic_xor_i32_noret_offset: +; VI-LABEL: global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_xor v[0:1], v2 +; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_atomic_xor_i32_noret_offset: +; GFX9-LABEL: global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v[0:1], v2, off offset:16 +; GFX9-NEXT: global_atomic_or v0, v[0:1], v2, off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 - %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst - ret void + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result } -define i32 @global_atomic_xor_i32_ret(ptr addrspace(1) %ptr, i32 %in) { -; SI-LABEL: global_atomic_xor_i32_ret: +; --------------------------------------------------------------------- +; atomicrmw xor +; --------------------------------------------------------------------- + +define void @global_atomic_xor_i32_noret(ptr addrspace(1) %ptr, i32 %in) { +; SI-LABEL: global_atomic_xor_i32_noret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: global_atomic_xor_i32_ret: +; VI-LABEL: global_atomic_xor_i32_noret: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_xor v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xor_i32_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_xor v[0:1], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst + ret void +} + +define void @global_atomic_xor_i32_noret_offset(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_xor_i32_noret_offset: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xor_i32_noret_offset: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_xor v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xor_i32_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_xor v[0:1], v2, off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst + ret void +} + +define i32 @global_atomic_xor_i32_ret(ptr addrspace(1) %ptr, i32 %in) { +; SI-LABEL: global_atomic_xor_i32_ret: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xor_i32_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc @@ -3263,6 +3865,79 @@ define i32 @global_atomic_xor_0_i32_ret(ptr addrspace(1) %ptr) { ret i32 %result } +define void @global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_xor v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_xor v[0:1], v2, off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_xor v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw max ; --------------------------------------------------------------------- @@ -3277,7 +3952,7 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB67_1: ; %atomicrmw.start +; SI-NEXT: .LBB83_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_max_i32_e32 v3, v4, v2 @@ -3291,7 +3966,7 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB67_1 +; SI-NEXT: s_cbranch_execnz .LBB83_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -3302,7 +3977,7 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB67_1: ; %atomicrmw.start +; VI-NEXT: .LBB83_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_i32_e32 v3, v4, v2 @@ -3313,7 +3988,7 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB67_1 +; VI-NEXT: s_cbranch_execnz .LBB83_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3323,7 +3998,7 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i32_e32 v3, v4, v2 @@ -3334,7 +4009,7 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB67_1 +; GFX9-NEXT: s_cbranch_execnz .LBB83_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3352,7 +4027,7 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB68_1: ; %atomicrmw.start +; SI-NEXT: .LBB84_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_max_i32_e32 v3, v4, v2 @@ -3366,7 +4041,7 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB68_1 +; SI-NEXT: s_cbranch_execnz .LBB84_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -3379,7 +4054,7 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB68_1: ; %atomicrmw.start +; VI-NEXT: .LBB84_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_i32_e32 v3, v4, v2 @@ -3390,7 +4065,7 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB68_1 +; VI-NEXT: s_cbranch_execnz .LBB84_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3400,7 +4075,7 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i32_e32 v3, v4, v2 @@ -3411,7 +4086,7 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB68_1 +; GFX9-NEXT: s_cbranch_execnz .LBB84_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3430,7 +4105,7 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB69_1: ; %atomicrmw.start +; SI-NEXT: .LBB85_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -3444,7 +4119,7 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB69_1 +; SI-NEXT: s_cbranch_execnz .LBB85_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 @@ -3456,7 +4131,7 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB69_1: ; %atomicrmw.start +; VI-NEXT: .LBB85_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v3 @@ -3467,7 +4142,7 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB69_1 +; VI-NEXT: s_cbranch_execnz .LBB85_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v3 @@ -3478,7 +4153,7 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -3489,7 +4164,7 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB69_1 +; GFX9-NEXT: s_cbranch_execnz .LBB85_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 @@ -3508,7 +4183,7 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB70_1: ; %atomicrmw.start +; SI-NEXT: .LBB86_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -3522,7 +4197,7 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB70_1 +; SI-NEXT: s_cbranch_execnz .LBB86_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 @@ -3536,7 +4211,7 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[3:4] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB70_1: ; %atomicrmw.start +; VI-NEXT: .LBB86_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, v0 @@ -3547,7 +4222,7 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB70_1 +; VI-NEXT: s_cbranch_execnz .LBB86_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3557,7 +4232,7 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -3568,7 +4243,7 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB70_1 +; GFX9-NEXT: s_cbranch_execnz .LBB86_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 @@ -3593,7 +4268,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB71_1: ; %atomicrmw.start +; SI-NEXT: .LBB87_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_max_i32_e32 v0, s34, v1 @@ -3607,7 +4282,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB71_1 +; SI-NEXT: s_cbranch_execnz .LBB87_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 @@ -3625,7 +4300,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB71_1: ; %atomicrmw.start +; VI-NEXT: .LBB87_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_i32_e32 v2, s6, v3 @@ -3636,7 +4311,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB71_1 +; VI-NEXT: s_cbranch_execnz .LBB87_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3647,7 +4322,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i32_e32 v0, s6, v1 @@ -3658,7 +4333,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB71_1 +; GFX9-NEXT: s_cbranch_execnz .LBB87_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3681,7 +4356,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace( ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB72_1: ; %atomicrmw.start +; SI-NEXT: .LBB88_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_max_i32_e32 v0, s34, v1 @@ -3695,7 +4370,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace( ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB72_1 +; SI-NEXT: s_cbranch_execnz .LBB88_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 @@ -3715,7 +4390,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: v_mov_b32_e32 v1, s35 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB72_1: ; %atomicrmw.start +; VI-NEXT: .LBB88_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_i32_e32 v2, s6, v3 @@ -3726,7 +4401,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB72_1 +; VI-NEXT: s_cbranch_execnz .LBB88_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3737,7 +4412,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i32_e32 v0, s6, v1 @@ -3748,7 +4423,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB72_1 +; GFX9-NEXT: s_cbranch_execnz .LBB88_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3772,7 +4447,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB73_1: ; %atomicrmw.start +; SI-NEXT: .LBB89_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 @@ -3786,7 +4461,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB73_1 +; SI-NEXT: s_cbranch_execnz .LBB89_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 @@ -3806,7 +4481,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: .LBB73_1: ; %atomicrmw.start +; VI-NEXT: .LBB89_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v0 @@ -3817,7 +4492,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB73_1 +; VI-NEXT: s_cbranch_execnz .LBB89_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3828,7 +4503,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -3839,7 +4514,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB73_1 +; GFX9-NEXT: s_cbranch_execnz .LBB89_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3862,7 +4537,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB74_1: ; %atomicrmw.start +; SI-NEXT: .LBB90_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 @@ -3876,7 +4551,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB74_1 +; SI-NEXT: s_cbranch_execnz .LBB90_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 @@ -3896,7 +4571,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v2, s35 ; VI-NEXT: flat_load_dword v0, v[1:2] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB74_1: ; %atomicrmw.start +; VI-NEXT: .LBB90_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v0 @@ -3907,7 +4582,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB74_1 +; VI-NEXT: s_cbranch_execnz .LBB90_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3918,7 +4593,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -3929,7 +4604,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB74_1 +; GFX9-NEXT: s_cbranch_execnz .LBB90_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3954,7 +4629,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: .LBB75_1: ; %atomicrmw.start +; SI-NEXT: .LBB91_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_max_i32_e32 v0, s2, v1 ; SI-NEXT: s_waitcnt expcnt(0) @@ -3967,7 +4642,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB75_1 +; SI-NEXT: s_cbranch_execnz .LBB91_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -3988,7 +4663,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: .LBB75_1: ; %atomicrmw.start +; VI-NEXT: .LBB91_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_max_i32_e32 v2, s2, v3 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3998,7 +4673,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB75_1 +; VI-NEXT: s_cbranch_execnz .LBB91_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -4016,7 +4691,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_max_i32_e32 v0, s2, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc @@ -4026,7 +4701,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB75_1 +; GFX9-NEXT: s_cbranch_execnz .LBB91_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -4053,7 +4728,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: .LBB76_1: ; %atomicrmw.start +; SI-NEXT: .LBB92_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_max_i32_e32 v0, s8, v1 ; SI-NEXT: s_waitcnt expcnt(0) @@ -4066,7 +4741,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB76_1 +; SI-NEXT: s_cbranch_execnz .LBB92_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -4094,7 +4769,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: .LBB76_1: ; %atomicrmw.start +; VI-NEXT: .LBB92_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: v_max_i32_e32 v2, s4, v3 @@ -4104,7 +4779,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB76_1 +; VI-NEXT: s_cbranch_execnz .LBB92_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -4127,7 +4802,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-NEXT: v_max_i32_e32 v2, s2, v3 @@ -4137,7 +4812,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB76_1 +; GFX9-NEXT: s_cbranch_execnz .LBB92_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -4167,7 +4842,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: .LBB77_1: ; %atomicrmw.start +; SI-NEXT: .LBB93_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_max_i32_e32 v0, s2, v1 ; SI-NEXT: s_waitcnt expcnt(0) @@ -4180,7 +4855,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB77_1 +; SI-NEXT: s_cbranch_execnz .LBB93_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -4199,7 +4874,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: .LBB77_1: ; %atomicrmw.start +; VI-NEXT: .LBB93_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_max_i32_e32 v2, s2, v3 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4209,7 +4884,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB77_1 +; VI-NEXT: s_cbranch_execnz .LBB93_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -4227,7 +4902,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_max_i32_e32 v0, s2, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc @@ -4237,7 +4912,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB77_1 +; GFX9-NEXT: s_cbranch_execnz .LBB93_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -4263,7 +4938,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: .LBB78_1: ; %atomicrmw.start +; SI-NEXT: .LBB94_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_max_i32_e32 v0, s8, v1 ; SI-NEXT: s_waitcnt expcnt(0) @@ -4276,7 +4951,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB78_1 +; SI-NEXT: s_cbranch_execnz .LBB94_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -4302,7 +4977,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: .LBB78_1: ; %atomicrmw.start +; VI-NEXT: .LBB94_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: v_max_i32_e32 v2, s4, v3 @@ -4312,7 +4987,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB78_1 +; VI-NEXT: s_cbranch_execnz .LBB94_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -4335,7 +5010,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-NEXT: v_max_i32_e32 v2, s2, v3 @@ -4345,7 +5020,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB78_1 +; GFX9-NEXT: s_cbranch_execnz .LBB94_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -4358,49 +5033,47 @@ entry: ret void } -; --------------------------------------------------------------------- -; atomicrmw umax -; --------------------------------------------------------------------- - -define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) { -; SI-LABEL: global_atomic_umax_i32_noret: +define void @global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory_access: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB79_1: ; %atomicrmw.start +; SI-NEXT: .LBB95_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_max_u32_e32 v3, v4, v2 +; SI-NEXT: v_max_i32_e32 v3, v4, v2 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v6, v4 ; SI-NEXT: v_mov_b32_e32 v5, v3 -; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB79_1 +; SI-NEXT: s_cbranch_execnz .LBB95_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: global_atomic_umax_i32_noret: +; VI-LABEL: global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory_access: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB79_1: ; %atomicrmw.start +; VI-NEXT: .LBB95_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_u32_e32 v3, v4, v2 +; VI-NEXT: v_max_i32_e32 v3, v4, v2 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4408,73 +5081,233 @@ define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB79_1 +; VI-NEXT: s_cbranch_execnz .LBB95_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_atomic_umax_i32_noret: +; GFX9-LABEL: global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory_access: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB95_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_u32_e32 v3, v4, v2 -; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX9-NEXT: v_max_i32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB79_1 +; GFX9-NEXT: s_cbranch_execnz .LBB95_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 ret void } -define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in) { -; SI-LABEL: global_atomic_umax_i32_noret_offset: +define i32 @global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB80_1: ; %atomicrmw.start +; SI-NEXT: .LBB96_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_max_u32_e32 v3, v4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v6, v4 ; SI-NEXT: v_mov_b32_e32 v5, v3 -; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_max_i32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB80_1 +; SI-NEXT: s_cbranch_execnz .LBB96_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: global_atomic_umax_i32_noret_offset: +; VI-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB96_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_max_i32_e32 v0, v1, v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB96_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB96_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_max_i32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB96_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + +; --------------------------------------------------------------------- +; atomicrmw umax +; --------------------------------------------------------------------- + +define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) { +; SI-LABEL: global_atomic_umax_i32_noret: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB97_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_max_u32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB97_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_umax_i32_noret: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB97_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_max_u32_e32 v3, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB97_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_umax_i32_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB97_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_u32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB97_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst + ret void +} + +define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_umax_i32_noret_offset: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB98_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_max_u32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB98_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_umax_i32_noret_offset: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB80_1: ; %atomicrmw.start +; VI-NEXT: .LBB98_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_u32_e32 v3, v4, v2 @@ -4485,7 +5318,7 @@ define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB80_1 +; VI-NEXT: s_cbranch_execnz .LBB98_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -4495,7 +5328,7 @@ define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_u32_e32 v3, v4, v2 @@ -4506,7 +5339,7 @@ define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB80_1 +; GFX9-NEXT: s_cbranch_execnz .LBB98_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4525,7 +5358,7 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB81_1: ; %atomicrmw.start +; SI-NEXT: .LBB99_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -4539,7 +5372,7 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB81_1 +; SI-NEXT: s_cbranch_execnz .LBB99_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 @@ -4551,7 +5384,7 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB81_1: ; %atomicrmw.start +; VI-NEXT: .LBB99_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v3 @@ -4562,7 +5395,7 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB81_1 +; VI-NEXT: s_cbranch_execnz .LBB99_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v3 @@ -4573,7 +5406,7 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -4584,7 +5417,7 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB81_1 +; GFX9-NEXT: s_cbranch_execnz .LBB99_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 @@ -4603,7 +5436,7 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB82_1: ; %atomicrmw.start +; SI-NEXT: .LBB100_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -4617,7 +5450,7 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB82_1 +; SI-NEXT: s_cbranch_execnz .LBB100_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 @@ -4631,7 +5464,7 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[3:4] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB82_1: ; %atomicrmw.start +; VI-NEXT: .LBB100_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, v0 @@ -4642,7 +5475,7 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB82_1 +; VI-NEXT: s_cbranch_execnz .LBB100_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -4652,7 +5485,7 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -4663,7 +5496,7 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB82_1 +; GFX9-NEXT: s_cbranch_execnz .LBB100_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 @@ -4688,7 +5521,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB83_1: ; %atomicrmw.start +; SI-NEXT: .LBB101_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_max_u32_e32 v0, s34, v1 @@ -4702,7 +5535,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB83_1 +; SI-NEXT: s_cbranch_execnz .LBB101_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 @@ -4720,7 +5553,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB83_1: ; %atomicrmw.start +; VI-NEXT: .LBB101_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_u32_e32 v2, s6, v3 @@ -4731,7 +5564,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB83_1 +; VI-NEXT: s_cbranch_execnz .LBB101_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -4742,7 +5575,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_u32_e32 v0, s6, v1 @@ -4753,7 +5586,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB83_1 +; GFX9-NEXT: s_cbranch_execnz .LBB101_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4776,7 +5609,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB84_1: ; %atomicrmw.start +; SI-NEXT: .LBB102_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_max_u32_e32 v0, s34, v1 @@ -4790,7 +5623,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB84_1 +; SI-NEXT: s_cbranch_execnz .LBB102_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 @@ -4810,7 +5643,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: v_mov_b32_e32 v1, s35 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB84_1: ; %atomicrmw.start +; VI-NEXT: .LBB102_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_u32_e32 v2, s6, v3 @@ -4821,7 +5654,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB84_1 +; VI-NEXT: s_cbranch_execnz .LBB102_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -4832,7 +5665,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_u32_e32 v0, s6, v1 @@ -4843,7 +5676,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB84_1 +; GFX9-NEXT: s_cbranch_execnz .LBB102_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4867,7 +5700,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB85_1: ; %atomicrmw.start +; SI-NEXT: .LBB103_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 @@ -4881,7 +5714,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB85_1 +; SI-NEXT: s_cbranch_execnz .LBB103_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 @@ -4901,7 +5734,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: .LBB85_1: ; %atomicrmw.start +; VI-NEXT: .LBB103_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v0 @@ -4912,7 +5745,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB85_1 +; VI-NEXT: s_cbranch_execnz .LBB103_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -4923,7 +5756,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -4934,7 +5767,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB85_1 +; GFX9-NEXT: s_cbranch_execnz .LBB103_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4957,7 +5790,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB86_1: ; %atomicrmw.start +; SI-NEXT: .LBB104_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 @@ -4971,7 +5804,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB86_1 +; SI-NEXT: s_cbranch_execnz .LBB104_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 @@ -4991,7 +5824,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v2, s35 ; VI-NEXT: flat_load_dword v0, v[1:2] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB86_1: ; %atomicrmw.start +; VI-NEXT: .LBB104_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v0 @@ -5002,7 +5835,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB86_1 +; VI-NEXT: s_cbranch_execnz .LBB104_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5013,7 +5846,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -5024,7 +5857,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB86_1 +; GFX9-NEXT: s_cbranch_execnz .LBB104_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5049,7 +5882,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: .LBB87_1: ; %atomicrmw.start +; SI-NEXT: .LBB105_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_max_u32_e32 v0, s2, v1 ; SI-NEXT: s_waitcnt expcnt(0) @@ -5062,7 +5895,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB87_1 +; SI-NEXT: s_cbranch_execnz .LBB105_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -5083,7 +5916,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: .LBB87_1: ; %atomicrmw.start +; VI-NEXT: .LBB105_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_max_u32_e32 v2, s2, v3 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -5093,7 +5926,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB87_1 +; VI-NEXT: s_cbranch_execnz .LBB105_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -5111,7 +5944,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_max_u32_e32 v0, s2, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5121,7 +5954,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB87_1 +; GFX9-NEXT: s_cbranch_execnz .LBB105_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -5148,7 +5981,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: .LBB88_1: ; %atomicrmw.start +; SI-NEXT: .LBB106_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_max_u32_e32 v0, s8, v1 ; SI-NEXT: s_waitcnt expcnt(0) @@ -5161,7 +5994,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB88_1 +; SI-NEXT: s_cbranch_execnz .LBB106_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -5189,7 +6022,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: .LBB88_1: ; %atomicrmw.start +; VI-NEXT: .LBB106_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: v_max_u32_e32 v2, s4, v3 @@ -5199,7 +6032,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB88_1 +; VI-NEXT: s_cbranch_execnz .LBB106_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -5222,7 +6055,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-NEXT: v_max_u32_e32 v2, s2, v3 @@ -5232,7 +6065,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB88_1 +; GFX9-NEXT: s_cbranch_execnz .LBB106_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -5263,7 +6096,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: .LBB89_1: ; %atomicrmw.start +; SI-NEXT: .LBB107_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_max_u32_e32 v0, s8, v1 ; SI-NEXT: s_waitcnt expcnt(0) @@ -5276,7 +6109,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB89_1 +; SI-NEXT: s_cbranch_execnz .LBB107_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -5302,7 +6135,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: .LBB89_1: ; %atomicrmw.start +; VI-NEXT: .LBB107_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: v_max_u32_e32 v2, s4, v3 @@ -5312,7 +6145,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB89_1 +; VI-NEXT: s_cbranch_execnz .LBB107_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -5335,7 +6168,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-NEXT: v_max_u32_e32 v2, s2, v3 @@ -5345,7 +6178,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB89_1 +; GFX9-NEXT: s_cbranch_execnz .LBB107_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -5358,6 +6191,164 @@ entry: ret void } +define void @global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB108_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_max_u32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB108_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB108_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_max_u32_e32 v3, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB108_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB108_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_u32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB108_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB109_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_max_u32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB109_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB109_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_max_u32_e32 v0, v1, v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB109_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB109_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_max_u32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB109_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw umin ; --------------------------------------------------------------------- @@ -5372,7 +6363,7 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB90_1: ; %atomicrmw.start +; SI-NEXT: .LBB110_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_min_u32_e32 v3, v4, v2 @@ -5386,7 +6377,7 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB90_1 +; SI-NEXT: s_cbranch_execnz .LBB110_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -5397,7 +6388,7 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB90_1: ; %atomicrmw.start +; VI-NEXT: .LBB110_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_min_u32_e32 v3, v4, v2 @@ -5408,7 +6399,7 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB90_1 +; VI-NEXT: s_cbranch_execnz .LBB110_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5418,7 +6409,7 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB110_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v3, v4, v2 @@ -5429,7 +6420,7 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB90_1 +; GFX9-NEXT: s_cbranch_execnz .LBB110_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5447,7 +6438,7 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB91_1: ; %atomicrmw.start +; SI-NEXT: .LBB111_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_min_u32_e32 v3, v4, v2 @@ -5461,7 +6452,7 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB91_1 +; SI-NEXT: s_cbranch_execnz .LBB111_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -5474,7 +6465,7 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB91_1: ; %atomicrmw.start +; VI-NEXT: .LBB111_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_min_u32_e32 v3, v4, v2 @@ -5485,7 +6476,7 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB91_1 +; VI-NEXT: s_cbranch_execnz .LBB111_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5495,7 +6486,7 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v3, v4, v2 @@ -5506,7 +6497,7 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB91_1 +; GFX9-NEXT: s_cbranch_execnz .LBB111_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5525,7 +6516,7 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB92_1: ; %atomicrmw.start +; SI-NEXT: .LBB112_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -5539,7 +6530,7 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB92_1 +; SI-NEXT: s_cbranch_execnz .LBB112_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 @@ -5551,7 +6542,7 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB92_1: ; %atomicrmw.start +; VI-NEXT: .LBB112_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v3 @@ -5562,7 +6553,7 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB92_1 +; VI-NEXT: s_cbranch_execnz .LBB112_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v3 @@ -5573,7 +6564,7 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -5584,7 +6575,7 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB92_1 +; GFX9-NEXT: s_cbranch_execnz .LBB112_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 @@ -5603,7 +6594,7 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB93_1: ; %atomicrmw.start +; SI-NEXT: .LBB113_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -5617,7 +6608,7 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB93_1 +; SI-NEXT: s_cbranch_execnz .LBB113_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 @@ -5631,7 +6622,7 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[3:4] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB93_1: ; %atomicrmw.start +; VI-NEXT: .LBB113_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, v0 @@ -5642,7 +6633,7 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB93_1 +; VI-NEXT: s_cbranch_execnz .LBB113_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5652,7 +6643,7 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -5663,7 +6654,7 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB93_1 +; GFX9-NEXT: s_cbranch_execnz .LBB113_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 @@ -5688,7 +6679,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB94_1: ; %atomicrmw.start +; SI-NEXT: .LBB114_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_min_u32_e32 v0, s34, v1 @@ -5702,7 +6693,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB94_1 +; SI-NEXT: s_cbranch_execnz .LBB114_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 @@ -5720,7 +6711,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB94_1: ; %atomicrmw.start +; VI-NEXT: .LBB114_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_min_u32_e32 v2, s6, v3 @@ -5731,7 +6722,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB94_1 +; VI-NEXT: s_cbranch_execnz .LBB114_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5742,7 +6733,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v0, s6, v1 @@ -5753,7 +6744,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB94_1 +; GFX9-NEXT: s_cbranch_execnz .LBB114_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5776,7 +6767,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB95_1: ; %atomicrmw.start +; SI-NEXT: .LBB115_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_min_u32_e32 v0, s34, v1 @@ -5790,7 +6781,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB95_1 +; SI-NEXT: s_cbranch_execnz .LBB115_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 @@ -5810,7 +6801,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: v_mov_b32_e32 v1, s35 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB95_1: ; %atomicrmw.start +; VI-NEXT: .LBB115_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_min_u32_e32 v2, s6, v3 @@ -5821,7 +6812,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB95_1 +; VI-NEXT: s_cbranch_execnz .LBB115_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5832,7 +6823,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB95_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB115_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v0, s6, v1 @@ -5843,7 +6834,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB95_1 +; GFX9-NEXT: s_cbranch_execnz .LBB115_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5867,7 +6858,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB96_1: ; %atomicrmw.start +; SI-NEXT: .LBB116_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 @@ -5881,7 +6872,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB96_1 +; SI-NEXT: s_cbranch_execnz .LBB116_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 @@ -5901,7 +6892,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: .LBB96_1: ; %atomicrmw.start +; VI-NEXT: .LBB116_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v0 @@ -5912,7 +6903,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB96_1 +; VI-NEXT: s_cbranch_execnz .LBB116_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5923,7 +6914,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB96_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB116_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -5934,7 +6925,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB96_1 +; GFX9-NEXT: s_cbranch_execnz .LBB116_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5957,7 +6948,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB97_1: ; %atomicrmw.start +; SI-NEXT: .LBB117_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 @@ -5971,7 +6962,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB97_1 +; SI-NEXT: s_cbranch_execnz .LBB117_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 @@ -5991,45 +6982,203 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v2, s35 ; VI-NEXT: flat_load_dword v0, v[1:2] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB97_1: ; %atomicrmw.start +; VI-NEXT: .LBB117_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_min_u32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB117_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_umin_i32_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB117_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_min_u32_e32 v2, s6, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB117_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %result = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst + ret i32 %result +} + +define void @global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB118_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_min_u32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB118_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB118_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_min_u32_e32 v3, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB118_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB118_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_min_u32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB118_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB119_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_min_u32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB119_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB119_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_min_u32_e32 v3, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_min_u32_e32 v0, v1, v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB97_1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB119_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_atomic_umin_i32_ret_offset_scalar: +; GFX9-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 -; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB97_1: ; %atomicrmw.start +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB119_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_min_u32_e32 v2, s6, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_min_u32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB97_1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB119_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 - %result = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 ret i32 %result } @@ -6047,7 +7196,7 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB98_1: ; %atomicrmw.start +; SI-NEXT: .LBB120_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_min_i32_e32 v3, v4, v2 @@ -6061,7 +7210,7 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB98_1 +; SI-NEXT: s_cbranch_execnz .LBB120_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -6072,7 +7221,7 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB98_1: ; %atomicrmw.start +; VI-NEXT: .LBB120_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_min_i32_e32 v3, v4, v2 @@ -6083,7 +7232,7 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB98_1 +; VI-NEXT: s_cbranch_execnz .LBB120_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6093,7 +7242,7 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i32_e32 v3, v4, v2 @@ -6104,7 +7253,7 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB98_1 +; GFX9-NEXT: s_cbranch_execnz .LBB120_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6122,7 +7271,7 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB99_1: ; %atomicrmw.start +; SI-NEXT: .LBB121_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_min_i32_e32 v3, v4, v2 @@ -6136,7 +7285,7 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB99_1 +; SI-NEXT: s_cbranch_execnz .LBB121_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -6149,7 +7298,7 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB99_1: ; %atomicrmw.start +; VI-NEXT: .LBB121_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_min_i32_e32 v3, v4, v2 @@ -6160,7 +7309,7 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB99_1 +; VI-NEXT: s_cbranch_execnz .LBB121_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6170,7 +7319,7 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i32_e32 v3, v4, v2 @@ -6181,7 +7330,7 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB99_1 +; GFX9-NEXT: s_cbranch_execnz .LBB121_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6200,7 +7349,7 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB100_1: ; %atomicrmw.start +; SI-NEXT: .LBB122_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -6214,7 +7363,7 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB100_1 +; SI-NEXT: s_cbranch_execnz .LBB122_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 @@ -6226,7 +7375,7 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB100_1: ; %atomicrmw.start +; VI-NEXT: .LBB122_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v3 @@ -6237,7 +7386,7 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB100_1 +; VI-NEXT: s_cbranch_execnz .LBB122_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v3 @@ -6248,7 +7397,7 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -6259,7 +7408,7 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB100_1 +; GFX9-NEXT: s_cbranch_execnz .LBB122_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 @@ -6278,7 +7427,7 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB101_1: ; %atomicrmw.start +; SI-NEXT: .LBB123_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -6292,7 +7441,7 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB101_1 +; SI-NEXT: s_cbranch_execnz .LBB123_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 @@ -6306,7 +7455,7 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[3:4] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB101_1: ; %atomicrmw.start +; VI-NEXT: .LBB123_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, v0 @@ -6317,7 +7466,7 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB101_1 +; VI-NEXT: s_cbranch_execnz .LBB123_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6327,7 +7476,7 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -6338,7 +7487,7 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB101_1 +; GFX9-NEXT: s_cbranch_execnz .LBB123_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 @@ -6363,7 +7512,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB102_1: ; %atomicrmw.start +; SI-NEXT: .LBB124_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_min_i32_e32 v0, s34, v1 @@ -6377,7 +7526,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB102_1 +; SI-NEXT: s_cbranch_execnz .LBB124_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 @@ -6395,7 +7544,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB102_1: ; %atomicrmw.start +; VI-NEXT: .LBB124_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_min_i32_e32 v2, s6, v3 @@ -6406,7 +7555,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB102_1 +; VI-NEXT: s_cbranch_execnz .LBB124_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6417,7 +7566,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i32_e32 v0, s6, v1 @@ -6428,7 +7577,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB102_1 +; GFX9-NEXT: s_cbranch_execnz .LBB124_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6451,7 +7600,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB103_1: ; %atomicrmw.start +; SI-NEXT: .LBB125_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_min_i32_e32 v0, s34, v1 @@ -6465,7 +7614,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB103_1 +; SI-NEXT: s_cbranch_execnz .LBB125_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 @@ -6485,7 +7634,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: v_mov_b32_e32 v1, s35 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB103_1: ; %atomicrmw.start +; VI-NEXT: .LBB125_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_min_i32_e32 v2, s6, v3 @@ -6496,7 +7645,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB103_1 +; VI-NEXT: s_cbranch_execnz .LBB125_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6507,7 +7656,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i32_e32 v0, s6, v1 @@ -6518,7 +7667,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB103_1 +; GFX9-NEXT: s_cbranch_execnz .LBB125_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6542,7 +7691,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB104_1: ; %atomicrmw.start +; SI-NEXT: .LBB126_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 @@ -6556,7 +7705,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB104_1 +; SI-NEXT: s_cbranch_execnz .LBB126_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 @@ -6576,7 +7725,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: .LBB104_1: ; %atomicrmw.start +; VI-NEXT: .LBB126_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v0 @@ -6587,7 +7736,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB104_1 +; VI-NEXT: s_cbranch_execnz .LBB126_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6598,7 +7747,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -6609,7 +7758,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB104_1 +; GFX9-NEXT: s_cbranch_execnz .LBB126_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6632,7 +7781,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB105_1: ; %atomicrmw.start +; SI-NEXT: .LBB127_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 @@ -6646,7 +7795,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB105_1 +; SI-NEXT: s_cbranch_execnz .LBB127_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 @@ -6666,7 +7815,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v2, s35 ; VI-NEXT: flat_load_dword v0, v[1:2] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB105_1: ; %atomicrmw.start +; VI-NEXT: .LBB127_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v0 @@ -6677,7 +7826,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB105_1 +; VI-NEXT: s_cbranch_execnz .LBB127_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6688,7 +7837,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -6699,7 +7848,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB105_1 +; GFX9-NEXT: s_cbranch_execnz .LBB127_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6724,7 +7873,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: .LBB106_1: ; %atomicrmw.start +; SI-NEXT: .LBB128_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_min_i32_e32 v0, s2, v1 ; SI-NEXT: s_waitcnt expcnt(0) @@ -6737,7 +7886,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB106_1 +; SI-NEXT: s_cbranch_execnz .LBB128_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -6758,7 +7907,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: .LBB106_1: ; %atomicrmw.start +; VI-NEXT: .LBB128_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_min_i32_e32 v2, s2, v3 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -6768,7 +7917,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB106_1 +; VI-NEXT: s_cbranch_execnz .LBB128_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -6786,7 +7935,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_min_i32_e32 v0, s2, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6796,7 +7945,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB106_1 +; GFX9-NEXT: s_cbranch_execnz .LBB128_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -6823,7 +7972,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: .LBB107_1: ; %atomicrmw.start +; SI-NEXT: .LBB129_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_min_i32_e32 v0, s8, v1 ; SI-NEXT: s_waitcnt expcnt(0) @@ -6836,7 +7985,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB107_1 +; SI-NEXT: s_cbranch_execnz .LBB129_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -6864,7 +8013,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: .LBB107_1: ; %atomicrmw.start +; VI-NEXT: .LBB129_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: v_min_i32_e32 v2, s4, v3 @@ -6874,7 +8023,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB107_1 +; VI-NEXT: s_cbranch_execnz .LBB129_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -6897,7 +8046,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB129_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-NEXT: v_min_i32_e32 v2, s2, v3 @@ -6907,7 +8056,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB107_1 +; GFX9-NEXT: s_cbranch_execnz .LBB129_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -6933,7 +8082,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: .LBB108_1: ; %atomicrmw.start +; SI-NEXT: .LBB130_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_min_i32_e32 v0, s2, v1 ; SI-NEXT: s_waitcnt expcnt(0) @@ -6946,7 +8095,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB108_1 +; SI-NEXT: s_cbranch_execnz .LBB130_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -6961,7 +8110,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: .LBB108_1: ; %atomicrmw.start +; VI-NEXT: .LBB130_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_min_i32_e32 v2, s2, v3 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -6971,7 +8120,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB108_1 +; VI-NEXT: s_cbranch_execnz .LBB130_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -6985,7 +8134,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: .LBB108_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB130_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_min_i32_e32 v0, s4, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc @@ -6995,7 +8144,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB108_1 +; GFX9-NEXT: s_cbranch_execnz .LBB130_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -7017,102 +8166,260 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_load_dword s6, s[4:5], 0x0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: .LBB109_1: ; %atomicrmw.start +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: .LBB131_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_min_i32_e32 v0, s8, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SI-NEXT: s_cbranch_execnz .LBB131_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[0:1] +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: atomic_min_i32_ret_addr64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_ashr_i32 s7, s5, 31 +; VI-NEXT: s_mov_b32 s6, s5 +; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 +; VI-NEXT: s_add_u32 s6, s0, s6 +; VI-NEXT: s_addc_u32 s7, s1, s7 +; VI-NEXT: s_load_dword s5, s[6:7], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB131_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_min_i32_e32 v2, s4, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_cbranch_execnz .LBB131_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_min_i32_ret_addr64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s1, s3, 31 +; GFX9-NEXT: s_mov_b32 s0, s3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: .LBB131_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_min_i32_e32 v2, s2, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB131_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v1, v0, s[6:7] +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index + %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst + store i32 %tmp0, ptr addrspace(1) %out2 + ret void +} + +define void @global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB132_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_min_i32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB132_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB132_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_min_i32_e32 v3, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB132_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB132_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_min_i32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB132_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB133_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_min_i32_e32 v0, s8, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; SI-NEXT: v_min_i32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB109_1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB133_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s2 -; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 -; SI-NEXT: s_endpgm +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: atomic_min_i32_ret_addr64: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s7, s5, 31 -; VI-NEXT: s_mov_b32 s6, s5 -; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 -; VI-NEXT: s_add_u32 s6, s0, s6 -; VI-NEXT: s_addc_u32 s7, s1, s7 -; VI-NEXT: s_load_dword s5, s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_mov_b64 s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: .LBB109_1: ; %atomicrmw.start +; VI-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB133_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: v_min_i32_e32 v2, s4, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_min_i32_e32 v0, v1, v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB109_1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB133_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: atomic_min_i32_ret_addr64: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s3, 31 -; GFX9-NEXT: s_mov_b32 s0, s3 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: .LBB109_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB133_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_min_i32_e32 v2, s2, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_min_i32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB109_1 +; GFX9-NEXT: s_cbranch_execnz .LBB133_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: global_store_dword v1, v0, s[6:7] -; GFX9-NEXT: s_endpgm -entry: - %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index - %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst - store i32 %tmp0, ptr addrspace(1) %out2 - ret void +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result } ; --------------------------------------------------------------------- @@ -7465,6 +8772,79 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_offset_scalar(ptr addrspa ret i32 %result } +define void @global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc v[0:1], v2, off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw udec_wrap ; --------------------------------------------------------------------- @@ -7814,3 +9194,78 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspa %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst ret i32 %result } + +define void @global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v[0:1], v2, off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index 380ce7f..f5c2bd6 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -367,6 +367,80 @@ define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_offset_scalar(ptr addrspace(1) ret i64 %result } +define void @global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + ; --------------------------------------------------------------------- ; atomicrmw xchg f64 ; --------------------------------------------------------------------- @@ -731,6 +805,80 @@ define amdgpu_gfx double @global_atomic_xchg_f64_ret_offset_scalar(ptr addrspace ret double %result } +define void @global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, double %in) { +; SI-LABEL: global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[2:3], off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define double @global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, double %in) { +; SI-LABEL: global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret double %result +} + ; --------------------------------------------------------------------- ; atomicrmw add ; --------------------------------------------------------------------- @@ -1095,6 +1243,80 @@ define amdgpu_gfx i64 @global_atomic_add_i64_ret_offset_scalar(ptr addrspace(1) ret i64 %result } +define void @global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_add_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_add_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + ; --------------------------------------------------------------------- ; atomicrmw sub ; --------------------------------------------------------------------- @@ -1459,6 +1681,80 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1) ret i64 %result } +define void @global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + ; --------------------------------------------------------------------- ; atomicrmw and ; --------------------------------------------------------------------- @@ -1823,21 +2119,95 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_offset_scalar(ptr addrspace(1) ret i64 %result } -; --------------------------------------------------------------------- -; atomicrmw nand -; --------------------------------------------------------------------- - -define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) { -; SI-LABEL: global_atomic_nand_i64_noret: +define void @global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB40_1: ; %atomicrmw.start +; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw nand +; --------------------------------------------------------------------- + +define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) { +; SI-LABEL: global_atomic_nand_i64_noret: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB50_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, v7, v3 @@ -1857,7 +2227,7 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB40_1 +; SI-NEXT: s_cbranch_execnz .LBB50_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -1868,7 +2238,7 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB40_1: ; %atomicrmw.start +; VI-NEXT: .LBB50_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v4, v7, v3 @@ -1883,7 +2253,7 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB40_1 +; VI-NEXT: s_cbranch_execnz .LBB50_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1893,7 +2263,7 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 @@ -1908,7 +2278,7 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB40_1 +; GFX9-NEXT: s_cbranch_execnz .LBB50_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1926,7 +2296,7 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB41_1: ; %atomicrmw.start +; SI-NEXT: .LBB51_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, v7, v3 @@ -1946,7 +2316,7 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB41_1 +; SI-NEXT: s_cbranch_execnz .LBB51_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -1959,7 +2329,7 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB41_1: ; %atomicrmw.start +; VI-NEXT: .LBB51_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v4, v7, v3 @@ -1974,7 +2344,7 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB41_1 +; VI-NEXT: s_cbranch_execnz .LBB51_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1984,7 +2354,7 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 @@ -1999,7 +2369,7 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB41_1 +; GFX9-NEXT: s_cbranch_execnz .LBB51_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2022,7 +2392,7 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB42_1: ; %atomicrmw.start +; SI-NEXT: .LBB52_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -2042,7 +2412,7 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB42_1 +; SI-NEXT: s_cbranch_execnz .LBB52_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -2053,7 +2423,7 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB42_1: ; %atomicrmw.start +; VI-NEXT: .LBB52_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v5 @@ -2068,7 +2438,7 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB42_1 +; VI-NEXT: s_cbranch_execnz .LBB52_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v4 @@ -2080,7 +2450,7 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -2095,7 +2465,7 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB42_1 +; GFX9-NEXT: s_cbranch_execnz .LBB52_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 @@ -2119,7 +2489,7 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB43_1: ; %atomicrmw.start +; SI-NEXT: .LBB53_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -2139,7 +2509,7 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB43_1 +; SI-NEXT: s_cbranch_execnz .LBB53_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -2152,7 +2522,7 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB43_1: ; %atomicrmw.start +; VI-NEXT: .LBB53_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -2167,7 +2537,7 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB43_1 +; VI-NEXT: s_cbranch_execnz .LBB53_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2177,7 +2547,7 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -2192,7 +2562,7 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB43_1 +; GFX9-NEXT: s_cbranch_execnz .LBB53_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 @@ -2219,7 +2589,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB44_1: ; %atomicrmw.start +; SI-NEXT: .LBB54_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, s34, v3 @@ -2239,7 +2609,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB44_1 +; SI-NEXT: s_cbranch_execnz .LBB54_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v8, 1 @@ -2259,7 +2629,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: .LBB44_1: ; %atomicrmw.start +; VI-NEXT: .LBB54_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, s7, v3 @@ -2274,7 +2644,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB44_1 +; VI-NEXT: s_cbranch_execnz .LBB54_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2285,7 +2655,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 @@ -2300,7 +2670,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB44_1 +; GFX9-NEXT: s_cbranch_execnz .LBB54_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2324,7 +2694,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB45_1: ; %atomicrmw.start +; SI-NEXT: .LBB55_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, s34, v3 @@ -2344,7 +2714,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB45_1 +; SI-NEXT: s_cbranch_execnz .LBB55_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v8, 1 @@ -2364,7 +2734,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB45_1: ; %atomicrmw.start +; VI-NEXT: .LBB55_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, s7, v3 @@ -2379,7 +2749,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB45_1 +; VI-NEXT: s_cbranch_execnz .LBB55_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2390,7 +2760,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 @@ -2405,7 +2775,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB45_1 +; GFX9-NEXT: s_cbranch_execnz .LBB55_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2430,7 +2800,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB46_1: ; %atomicrmw.start +; SI-NEXT: .LBB56_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v1 @@ -2450,7 +2820,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB46_1 +; SI-NEXT: s_cbranch_execnz .LBB56_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v6, 1 @@ -2470,7 +2840,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: .LBB46_1: ; %atomicrmw.start +; VI-NEXT: .LBB56_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v1 @@ -2485,7 +2855,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB46_1 +; VI-NEXT: s_cbranch_execnz .LBB56_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2496,7 +2866,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v6, v1 @@ -2511,7 +2881,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB46_1 +; GFX9-NEXT: s_cbranch_execnz .LBB56_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2535,7 +2905,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB47_1: ; %atomicrmw.start +; SI-NEXT: .LBB57_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v1 @@ -2555,7 +2925,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB47_1 +; SI-NEXT: s_cbranch_execnz .LBB57_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v6, 1 @@ -2575,7 +2945,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB47_1: ; %atomicrmw.start +; VI-NEXT: .LBB57_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v1 @@ -2590,7 +2960,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB47_1 +; VI-NEXT: s_cbranch_execnz .LBB57_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2601,7 +2971,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v6, v1 @@ -2616,7 +2986,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB47_1 +; GFX9-NEXT: s_cbranch_execnz .LBB57_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2625,6 +2995,196 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ret i64 %result } +define void @global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB58_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, v7, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, v6, v2 +; SI-NEXT: v_not_b32_e32 v5, v4 +; SI-NEXT: v_not_b32_e32 v4, v8 +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB58_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB58_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v4, v7, v3 +; VI-NEXT: v_and_b32_e32 v8, v6, v2 +; VI-NEXT: v_not_b32_e32 v5, v4 +; VI-NEXT: v_not_b32_e32 v4, v8 +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB58_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB58_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB59_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, v11, v6 +; SI-NEXT: v_and_b32_e32 v1, v10, v7 +; SI-NEXT: v_not_b32_e32 v9, v0 +; SI-NEXT: v_not_b32_e32 v8, v1 +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB59_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB59_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_and_b32_e32 v0, v9, v3 +; VI-NEXT: v_and_b32_e32 v1, v8, v2 +; VI-NEXT: v_not_b32_e32 v7, v0 +; VI-NEXT: v_not_b32_e32 v6, v1 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB59_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB59_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + ; --------------------------------------------------------------------- ; atomicrmw or ; --------------------------------------------------------------------- @@ -2989,36 +3549,110 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_offset_scalar(ptr addrspace(1) i ret i64 %result } -; --------------------------------------------------------------------- -; atomicrmw xor -; --------------------------------------------------------------------- - -define void @global_atomic_xor_i64_noret(ptr addrspace(1) %ptr, i64 %in) { -; SI-LABEL: global_atomic_xor_i64_noret: +define void @global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: global_atomic_xor_i64_noret: +; VI-LABEL: global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_atomic_xor_i64_noret: +; GFX9-LABEL: global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw xor +; --------------------------------------------------------------------- + +define void @global_atomic_xor_i64_noret(ptr addrspace(1) %ptr, i64 %in) { +; SI-LABEL: global_atomic_xor_i64_noret: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xor_i64_noret: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xor_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3353,6 +3987,80 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_offset_scalar(ptr addrspace(1) ret i64 %result } +define void @global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + ; --------------------------------------------------------------------- ; atomicrmw max ; --------------------------------------------------------------------- @@ -3367,7 +4075,7 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB64_1: ; %atomicrmw.start +; SI-NEXT: .LBB80_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -3386,7 +4094,7 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB64_1 +; SI-NEXT: s_cbranch_execnz .LBB80_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -3397,7 +4105,7 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB64_1: ; %atomicrmw.start +; VI-NEXT: .LBB80_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -3411,7 +4119,7 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB64_1 +; VI-NEXT: s_cbranch_execnz .LBB80_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3421,7 +4129,7 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -3435,7 +4143,7 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB64_1 +; GFX9-NEXT: s_cbranch_execnz .LBB80_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3453,7 +4161,7 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB65_1: ; %atomicrmw.start +; SI-NEXT: .LBB81_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -3472,7 +4180,7 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB65_1 +; SI-NEXT: s_cbranch_execnz .LBB81_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -3485,7 +4193,7 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB65_1: ; %atomicrmw.start +; VI-NEXT: .LBB81_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -3499,7 +4207,7 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB65_1 +; VI-NEXT: s_cbranch_execnz .LBB81_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3509,7 +4217,7 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -3523,7 +4231,7 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB65_1 +; GFX9-NEXT: s_cbranch_execnz .LBB81_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3546,7 +4254,7 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB66_1: ; %atomicrmw.start +; SI-NEXT: .LBB82_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -3565,7 +4273,7 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB66_1 +; SI-NEXT: s_cbranch_execnz .LBB82_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -3576,7 +4284,7 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB66_1: ; %atomicrmw.start +; VI-NEXT: .LBB82_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v5 @@ -3590,7 +4298,7 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB66_1 +; VI-NEXT: s_cbranch_execnz .LBB82_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v4 @@ -3602,7 +4310,7 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -3616,7 +4324,7 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB66_1 +; GFX9-NEXT: s_cbranch_execnz .LBB82_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 @@ -3640,7 +4348,7 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB67_1: ; %atomicrmw.start +; SI-NEXT: .LBB83_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -3659,7 +4367,7 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB67_1 +; SI-NEXT: s_cbranch_execnz .LBB83_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -3672,7 +4380,7 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB67_1: ; %atomicrmw.start +; VI-NEXT: .LBB83_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -3686,7 +4394,7 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB67_1 +; VI-NEXT: s_cbranch_execnz .LBB83_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3696,7 +4404,7 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -3710,7 +4418,7 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB67_1 +; GFX9-NEXT: s_cbranch_execnz .LBB83_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 @@ -3739,7 +4447,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB68_1: ; %atomicrmw.start +; SI-NEXT: .LBB84_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3] @@ -3758,7 +4466,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB68_1 +; SI-NEXT: s_cbranch_execnz .LBB84_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -3780,7 +4488,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: .LBB68_1: ; %atomicrmw.start +; VI-NEXT: .LBB84_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -3794,7 +4502,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB68_1 +; VI-NEXT: s_cbranch_execnz .LBB84_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3807,7 +4515,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -3821,7 +4529,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB68_1 +; GFX9-NEXT: s_cbranch_execnz .LBB84_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3847,7 +4555,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB69_1: ; %atomicrmw.start +; SI-NEXT: .LBB85_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3] @@ -3866,7 +4574,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB69_1 +; SI-NEXT: s_cbranch_execnz .LBB85_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -3888,7 +4596,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 -; VI-NEXT: .LBB69_1: ; %atomicrmw.start +; VI-NEXT: .LBB85_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -3902,7 +4610,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB69_1 +; VI-NEXT: s_cbranch_execnz .LBB85_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3915,7 +4623,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -3929,7 +4637,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB69_1 +; GFX9-NEXT: s_cbranch_execnz .LBB85_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3956,7 +4664,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB70_1: ; %atomicrmw.start +; SI-NEXT: .LBB86_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, v1 @@ -3975,7 +4683,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB70_1 +; SI-NEXT: s_cbranch_execnz .LBB86_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -3997,7 +4705,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: .LBB70_1: ; %atomicrmw.start +; VI-NEXT: .LBB86_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -4011,7 +4719,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB70_1 +; VI-NEXT: s_cbranch_execnz .LBB86_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -4024,7 +4732,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v8, v1 @@ -4038,7 +4746,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB70_1 +; GFX9-NEXT: s_cbranch_execnz .LBB86_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4064,7 +4772,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB71_1: ; %atomicrmw.start +; SI-NEXT: .LBB87_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, v1 @@ -4083,7 +4791,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB71_1 +; SI-NEXT: s_cbranch_execnz .LBB87_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -4105,7 +4813,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 -; VI-NEXT: .LBB71_1: ; %atomicrmw.start +; VI-NEXT: .LBB87_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -4119,7 +4827,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB71_1 +; VI-NEXT: s_cbranch_execnz .LBB87_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -4132,7 +4840,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v8, v1 @@ -4146,7 +4854,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB71_1 +; GFX9-NEXT: s_cbranch_execnz .LBB87_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4173,7 +4881,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: .LBB72_1: ; %atomicrmw.start +; SI-NEXT: .LBB88_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc @@ -4191,7 +4899,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB72_1 +; SI-NEXT: s_cbranch_execnz .LBB88_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -4214,7 +4922,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: .LBB72_1: ; %atomicrmw.start +; VI-NEXT: .LBB88_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc @@ -4227,7 +4935,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB72_1 +; VI-NEXT: s_cbranch_execnz .LBB88_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -4247,7 +4955,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc @@ -4260,7 +4968,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB72_1 +; GFX9-NEXT: s_cbranch_execnz .LBB88_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -4287,7 +4995,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: .LBB73_1: ; %atomicrmw.start +; SI-NEXT: .LBB89_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc @@ -4305,7 +5013,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB73_1 +; SI-NEXT: s_cbranch_execnz .LBB89_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -4333,7 +5041,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: .LBB73_1: ; %atomicrmw.start +; VI-NEXT: .LBB89_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v9, v3 ; VI-NEXT: v_mov_b32_e32 v8, v2 @@ -4346,7 +5054,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; VI-NEXT: s_cbranch_execnz .LBB73_1 +; VI-NEXT: s_cbranch_execnz .LBB89_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -4369,7 +5077,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 @@ -4382,7 +5090,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_cbranch_execnz .LBB73_1 +; GFX9-NEXT: s_cbranch_execnz .LBB89_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -4414,7 +5122,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: .LBB74_1: ; %atomicrmw.start +; SI-NEXT: .LBB90_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc @@ -4432,7 +5140,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB74_1 +; SI-NEXT: s_cbranch_execnz .LBB90_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -4453,7 +5161,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: .LBB74_1: ; %atomicrmw.start +; VI-NEXT: .LBB90_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc @@ -4466,7 +5174,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB74_1 +; VI-NEXT: s_cbranch_execnz .LBB90_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -4486,7 +5194,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc @@ -4499,7 +5207,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB74_1 +; GFX9-NEXT: s_cbranch_execnz .LBB90_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -4525,7 +5233,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: .LBB75_1: ; %atomicrmw.start +; SI-NEXT: .LBB91_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc @@ -4543,7 +5251,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB75_1 +; SI-NEXT: s_cbranch_execnz .LBB91_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -4569,7 +5277,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: v_mov_b32_e32 v3, s9 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: .LBB75_1: ; %atomicrmw.start +; VI-NEXT: .LBB91_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v9, v3 ; VI-NEXT: v_mov_b32_e32 v8, v2 @@ -4582,7 +5290,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB75_1 +; VI-NEXT: s_cbranch_execnz .LBB91_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -4605,7 +5313,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 @@ -4618,7 +5326,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_cbranch_execnz .LBB75_1 +; GFX9-NEXT: s_cbranch_execnz .LBB91_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -4631,24 +5339,20 @@ entry: ret void } -; --------------------------------------------------------------------- -; atomicrmw umax -; --------------------------------------------------------------------- - -define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) { -; SI-LABEL: global_atomic_umax_i64_noret: +define void @global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB76_1: ; %atomicrmw.start +; SI-NEXT: .LBB92_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; SI-NEXT: s_waitcnt expcnt(0) @@ -4656,7 +5360,7 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: v_mov_b32_e32 v10, v6 ; SI-NEXT: v_mov_b32_e32 v9, v5 ; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] @@ -4664,21 +5368,23 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB76_1 +; SI-NEXT: s_cbranch_execnz .LBB92_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: global_atomic_umax_i64_noret: +; VI-LABEL: global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB76_1: ; %atomicrmw.start +; VI-NEXT: .LBB92_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; VI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc @@ -4689,23 +5395,23 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB76_1 +; VI-NEXT: s_cbranch_execnz .LBB92_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_atomic_umax_i64_noret: +; GFX9-LABEL: global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -4713,32 +5419,218 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB76_1 +; GFX9-NEXT: s_cbranch_execnz .LBB92_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i64 %in seq_cst + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 ret void } -define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) { -; SI-LABEL: global_atomic_umax_i64_noret_offset: +define i64 @global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB77_1: ; %atomicrmw.start +; SI-NEXT: .LBB93_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[10:11], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB93_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB93_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB93_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB93_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw umax +; --------------------------------------------------------------------- + +define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) { +; SI-LABEL: global_atomic_umax_i64_noret: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB94_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB94_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_umax_i64_noret: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB94_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB94_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_umax_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB94_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i64 %in seq_cst + ret void +} + +define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_umax_i64_noret_offset: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB95_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 ; SI-NEXT: v_mov_b32_e32 v10, v6 ; SI-NEXT: v_mov_b32_e32 v9, v5 ; SI-NEXT: v_mov_b32_e32 v8, v4 @@ -4750,7 +5642,7 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB77_1 +; SI-NEXT: s_cbranch_execnz .LBB95_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -4763,7 +5655,7 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB77_1: ; %atomicrmw.start +; VI-NEXT: .LBB95_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -4777,7 +5669,7 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB77_1 +; VI-NEXT: s_cbranch_execnz .LBB95_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -4787,7 +5679,7 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB95_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -4801,7 +5693,7 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB77_1 +; GFX9-NEXT: s_cbranch_execnz .LBB95_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4824,7 +5716,7 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB78_1: ; %atomicrmw.start +; SI-NEXT: .LBB96_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -4843,7 +5735,7 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB78_1 +; SI-NEXT: s_cbranch_execnz .LBB96_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -4854,7 +5746,7 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB78_1: ; %atomicrmw.start +; VI-NEXT: .LBB96_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v5 @@ -4868,7 +5760,7 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB78_1 +; VI-NEXT: s_cbranch_execnz .LBB96_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v4 @@ -4880,7 +5772,7 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB96_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -4894,7 +5786,7 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB78_1 +; GFX9-NEXT: s_cbranch_execnz .LBB96_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 @@ -4918,7 +5810,7 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB79_1: ; %atomicrmw.start +; SI-NEXT: .LBB97_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -4937,7 +5829,7 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB79_1 +; SI-NEXT: s_cbranch_execnz .LBB97_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -4950,7 +5842,7 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB79_1: ; %atomicrmw.start +; VI-NEXT: .LBB97_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -4964,7 +5856,7 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB79_1 +; VI-NEXT: s_cbranch_execnz .LBB97_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -4974,7 +5866,7 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB97_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -4988,7 +5880,7 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB79_1 +; GFX9-NEXT: s_cbranch_execnz .LBB97_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 @@ -5017,7 +5909,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB80_1: ; %atomicrmw.start +; SI-NEXT: .LBB98_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3] @@ -5036,7 +5928,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB80_1 +; SI-NEXT: s_cbranch_execnz .LBB98_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -5058,7 +5950,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: .LBB80_1: ; %atomicrmw.start +; VI-NEXT: .LBB98_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -5072,7 +5964,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB80_1 +; VI-NEXT: s_cbranch_execnz .LBB98_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5085,7 +5977,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -5099,7 +5991,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB80_1 +; GFX9-NEXT: s_cbranch_execnz .LBB98_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5125,7 +6017,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB81_1: ; %atomicrmw.start +; SI-NEXT: .LBB99_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3] @@ -5144,7 +6036,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB81_1 +; SI-NEXT: s_cbranch_execnz .LBB99_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -5166,7 +6058,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 -; VI-NEXT: .LBB81_1: ; %atomicrmw.start +; VI-NEXT: .LBB99_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -5180,7 +6072,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB81_1 +; VI-NEXT: s_cbranch_execnz .LBB99_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5193,7 +6085,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -5207,7 +6099,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB81_1 +; GFX9-NEXT: s_cbranch_execnz .LBB99_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5234,7 +6126,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB82_1: ; %atomicrmw.start +; SI-NEXT: .LBB100_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, v1 @@ -5253,7 +6145,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB82_1 +; SI-NEXT: s_cbranch_execnz .LBB100_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -5275,7 +6167,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: .LBB82_1: ; %atomicrmw.start +; VI-NEXT: .LBB100_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -5289,7 +6181,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB82_1 +; VI-NEXT: s_cbranch_execnz .LBB100_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5302,7 +6194,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v8, v1 @@ -5316,7 +6208,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB82_1 +; GFX9-NEXT: s_cbranch_execnz .LBB100_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5342,7 +6234,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB83_1: ; %atomicrmw.start +; SI-NEXT: .LBB101_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, v1 @@ -5361,7 +6253,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB83_1 +; SI-NEXT: s_cbranch_execnz .LBB101_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -5383,7 +6275,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 -; VI-NEXT: .LBB83_1: ; %atomicrmw.start +; VI-NEXT: .LBB101_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -5397,7 +6289,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB83_1 +; VI-NEXT: s_cbranch_execnz .LBB101_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5410,7 +6302,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v8, v1 @@ -5424,7 +6316,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB83_1 +; GFX9-NEXT: s_cbranch_execnz .LBB101_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5451,7 +6343,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: .LBB84_1: ; %atomicrmw.start +; SI-NEXT: .LBB102_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc @@ -5469,7 +6361,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB84_1 +; SI-NEXT: s_cbranch_execnz .LBB102_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -5492,7 +6384,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: .LBB84_1: ; %atomicrmw.start +; VI-NEXT: .LBB102_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc @@ -5505,7 +6397,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB84_1 +; VI-NEXT: s_cbranch_execnz .LBB102_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -5525,7 +6417,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc @@ -5538,7 +6430,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB84_1 +; GFX9-NEXT: s_cbranch_execnz .LBB102_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -5565,7 +6457,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: .LBB85_1: ; %atomicrmw.start +; SI-NEXT: .LBB103_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc @@ -5583,7 +6475,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB85_1 +; SI-NEXT: s_cbranch_execnz .LBB103_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -5611,7 +6503,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: .LBB85_1: ; %atomicrmw.start +; VI-NEXT: .LBB103_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v9, v3 ; VI-NEXT: v_mov_b32_e32 v8, v2 @@ -5624,7 +6516,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; VI-NEXT: s_cbranch_execnz .LBB85_1 +; VI-NEXT: s_cbranch_execnz .LBB103_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -5647,7 +6539,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 @@ -5660,7 +6552,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_cbranch_execnz .LBB85_1 +; GFX9-NEXT: s_cbranch_execnz .LBB103_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -5691,7 +6583,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: .LBB86_1: ; %atomicrmw.start +; SI-NEXT: .LBB104_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc @@ -5709,7 +6601,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB86_1 +; SI-NEXT: s_cbranch_execnz .LBB104_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -5735,7 +6627,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: v_mov_b32_e32 v3, s9 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: .LBB86_1: ; %atomicrmw.start +; VI-NEXT: .LBB104_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v9, v3 ; VI-NEXT: v_mov_b32_e32 v8, v2 @@ -5748,7 +6640,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB86_1 +; VI-NEXT: s_cbranch_execnz .LBB104_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -5771,7 +6663,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 @@ -5784,7 +6676,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_cbranch_execnz .LBB86_1 +; GFX9-NEXT: s_cbranch_execnz .LBB104_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -5797,6 +6689,190 @@ entry: ret void } +define void @global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB105_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB105_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB105_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB105_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB105_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB106_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[10:11], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB106_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB106_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB106_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB106_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + ; --------------------------------------------------------------------- ; atomicrmw umin ; --------------------------------------------------------------------- @@ -5811,7 +6887,7 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB87_1: ; %atomicrmw.start +; SI-NEXT: .LBB107_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -5830,7 +6906,7 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB87_1 +; SI-NEXT: s_cbranch_execnz .LBB107_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -5841,7 +6917,7 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB87_1: ; %atomicrmw.start +; VI-NEXT: .LBB107_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -5855,7 +6931,7 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB87_1 +; VI-NEXT: s_cbranch_execnz .LBB107_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5865,7 +6941,7 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -5879,7 +6955,7 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB87_1 +; GFX9-NEXT: s_cbranch_execnz .LBB107_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5897,7 +6973,7 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB88_1: ; %atomicrmw.start +; SI-NEXT: .LBB108_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -5916,7 +6992,7 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB88_1 +; SI-NEXT: s_cbranch_execnz .LBB108_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -5929,7 +7005,7 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB88_1: ; %atomicrmw.start +; VI-NEXT: .LBB108_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -5943,7 +7019,7 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB88_1 +; VI-NEXT: s_cbranch_execnz .LBB108_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5953,7 +7029,7 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB108_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -5967,7 +7043,7 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB88_1 +; GFX9-NEXT: s_cbranch_execnz .LBB108_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5990,7 +7066,7 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB89_1: ; %atomicrmw.start +; SI-NEXT: .LBB109_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -6009,7 +7085,7 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB89_1 +; SI-NEXT: s_cbranch_execnz .LBB109_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -6020,7 +7096,7 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB89_1: ; %atomicrmw.start +; VI-NEXT: .LBB109_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v5 @@ -6034,7 +7110,7 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB89_1 +; VI-NEXT: s_cbranch_execnz .LBB109_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v4 @@ -6046,7 +7122,7 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB109_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -6060,7 +7136,7 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB89_1 +; GFX9-NEXT: s_cbranch_execnz .LBB109_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 @@ -6084,7 +7160,7 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB90_1: ; %atomicrmw.start +; SI-NEXT: .LBB110_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -6103,7 +7179,7 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB90_1 +; SI-NEXT: s_cbranch_execnz .LBB110_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -6116,7 +7192,7 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB90_1: ; %atomicrmw.start +; VI-NEXT: .LBB110_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -6130,7 +7206,7 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB90_1 +; VI-NEXT: s_cbranch_execnz .LBB110_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6140,7 +7216,7 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB110_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -6154,7 +7230,7 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB90_1 +; GFX9-NEXT: s_cbranch_execnz .LBB110_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 @@ -6183,7 +7259,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB91_1: ; %atomicrmw.start +; SI-NEXT: .LBB111_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3] @@ -6202,7 +7278,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB91_1 +; SI-NEXT: s_cbranch_execnz .LBB111_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -6224,7 +7300,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: .LBB91_1: ; %atomicrmw.start +; VI-NEXT: .LBB111_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -6238,7 +7314,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB91_1 +; VI-NEXT: s_cbranch_execnz .LBB111_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6251,7 +7327,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -6265,7 +7341,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB91_1 +; GFX9-NEXT: s_cbranch_execnz .LBB111_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6291,7 +7367,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB92_1: ; %atomicrmw.start +; SI-NEXT: .LBB112_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3] @@ -6310,7 +7386,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB92_1 +; SI-NEXT: s_cbranch_execnz .LBB112_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -6332,7 +7408,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 -; VI-NEXT: .LBB92_1: ; %atomicrmw.start +; VI-NEXT: .LBB112_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -6346,7 +7422,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB92_1 +; VI-NEXT: s_cbranch_execnz .LBB112_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6359,7 +7435,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -6373,7 +7449,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB92_1 +; GFX9-NEXT: s_cbranch_execnz .LBB112_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6400,7 +7476,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB93_1: ; %atomicrmw.start +; SI-NEXT: .LBB113_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, v1 @@ -6419,7 +7495,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB93_1 +; SI-NEXT: s_cbranch_execnz .LBB113_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -6441,7 +7517,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: .LBB93_1: ; %atomicrmw.start +; VI-NEXT: .LBB113_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -6455,7 +7531,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB93_1 +; VI-NEXT: s_cbranch_execnz .LBB113_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6468,7 +7544,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v8, v1 @@ -6482,7 +7558,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB93_1 +; GFX9-NEXT: s_cbranch_execnz .LBB113_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6508,7 +7584,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB94_1: ; %atomicrmw.start +; SI-NEXT: .LBB114_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, v1 @@ -6527,7 +7603,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB94_1 +; SI-NEXT: s_cbranch_execnz .LBB114_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -6549,7 +7625,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 -; VI-NEXT: .LBB94_1: ; %atomicrmw.start +; VI-NEXT: .LBB114_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -6563,7 +7639,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB94_1 +; VI-NEXT: s_cbranch_execnz .LBB114_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6576,7 +7652,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v8, v1 @@ -6590,7 +7666,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB94_1 +; GFX9-NEXT: s_cbranch_execnz .LBB114_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6599,24 +7675,20 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ret i64 %result } -; --------------------------------------------------------------------- -; atomicrmw min -; --------------------------------------------------------------------- - -define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { -; SI-LABEL: global_atomic_min_i64_noret: +define void @global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB95_1: ; %atomicrmw.start +; SI-NEXT: .LBB115_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; SI-NEXT: s_waitcnt expcnt(0) @@ -6624,7 +7696,7 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: v_mov_b32_e32 v10, v6 ; SI-NEXT: v_mov_b32_e32 v9, v5 ; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] @@ -6632,7 +7704,195 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB95_1 +; SI-NEXT: s_cbranch_execnz .LBB115_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB115_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB115_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB115_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB115_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB116_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_cmp_le_u64_e32 vcc, v[10:11], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB116_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB116_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB116_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB116_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB116_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw min +; --------------------------------------------------------------------- + +define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { +; SI-LABEL: global_atomic_min_i64_noret: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB117_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB117_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -6643,7 +7903,7 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB95_1: ; %atomicrmw.start +; VI-NEXT: .LBB117_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -6657,7 +7917,7 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB95_1 +; VI-NEXT: s_cbranch_execnz .LBB117_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6667,7 +7927,7 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB95_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB117_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -6681,7 +7941,7 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB95_1 +; GFX9-NEXT: s_cbranch_execnz .LBB117_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6699,7 +7959,7 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB96_1: ; %atomicrmw.start +; SI-NEXT: .LBB118_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -6718,7 +7978,7 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB96_1 +; SI-NEXT: s_cbranch_execnz .LBB118_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -6731,7 +7991,7 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB96_1: ; %atomicrmw.start +; VI-NEXT: .LBB118_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -6745,7 +8005,7 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB96_1 +; VI-NEXT: s_cbranch_execnz .LBB118_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6755,7 +8015,7 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB96_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -6769,7 +8029,7 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB96_1 +; GFX9-NEXT: s_cbranch_execnz .LBB118_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6792,7 +8052,7 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB97_1: ; %atomicrmw.start +; SI-NEXT: .LBB119_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -6811,7 +8071,7 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB97_1 +; SI-NEXT: s_cbranch_execnz .LBB119_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -6822,7 +8082,7 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB97_1: ; %atomicrmw.start +; VI-NEXT: .LBB119_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v5 @@ -6836,7 +8096,7 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB97_1 +; VI-NEXT: s_cbranch_execnz .LBB119_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v4 @@ -6848,7 +8108,7 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB97_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB119_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -6862,7 +8122,7 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB97_1 +; GFX9-NEXT: s_cbranch_execnz .LBB119_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 @@ -6886,7 +8146,7 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB98_1: ; %atomicrmw.start +; SI-NEXT: .LBB120_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -6905,7 +8165,7 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB98_1 +; SI-NEXT: s_cbranch_execnz .LBB120_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -6918,7 +8178,7 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB98_1: ; %atomicrmw.start +; VI-NEXT: .LBB120_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -6932,7 +8192,7 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB98_1 +; VI-NEXT: s_cbranch_execnz .LBB120_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6942,7 +8202,7 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -6956,7 +8216,7 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB98_1 +; GFX9-NEXT: s_cbranch_execnz .LBB120_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 @@ -6985,7 +8245,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB99_1: ; %atomicrmw.start +; SI-NEXT: .LBB121_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3] @@ -7004,7 +8264,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB99_1 +; SI-NEXT: s_cbranch_execnz .LBB121_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -7026,7 +8286,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: .LBB99_1: ; %atomicrmw.start +; VI-NEXT: .LBB121_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -7040,7 +8300,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB99_1 +; VI-NEXT: s_cbranch_execnz .LBB121_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -7053,7 +8313,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -7067,7 +8327,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB99_1 +; GFX9-NEXT: s_cbranch_execnz .LBB121_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7093,7 +8353,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB100_1: ; %atomicrmw.start +; SI-NEXT: .LBB122_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3] @@ -7112,7 +8372,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB100_1 +; SI-NEXT: s_cbranch_execnz .LBB122_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -7134,7 +8394,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 -; VI-NEXT: .LBB100_1: ; %atomicrmw.start +; VI-NEXT: .LBB122_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -7148,7 +8408,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB100_1 +; VI-NEXT: s_cbranch_execnz .LBB122_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -7161,7 +8421,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -7175,7 +8435,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB100_1 +; GFX9-NEXT: s_cbranch_execnz .LBB122_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7202,7 +8462,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB101_1: ; %atomicrmw.start +; SI-NEXT: .LBB123_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, v1 @@ -7221,7 +8481,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB101_1 +; SI-NEXT: s_cbranch_execnz .LBB123_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -7243,7 +8503,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: .LBB101_1: ; %atomicrmw.start +; VI-NEXT: .LBB123_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -7257,7 +8517,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB101_1 +; VI-NEXT: s_cbranch_execnz .LBB123_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -7270,7 +8530,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v8, v1 @@ -7284,7 +8544,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB101_1 +; GFX9-NEXT: s_cbranch_execnz .LBB123_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7310,7 +8570,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB102_1: ; %atomicrmw.start +; SI-NEXT: .LBB124_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, v1 @@ -7329,7 +8589,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB102_1 +; SI-NEXT: s_cbranch_execnz .LBB124_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -7351,7 +8611,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 -; VI-NEXT: .LBB102_1: ; %atomicrmw.start +; VI-NEXT: .LBB124_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -7365,7 +8625,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB102_1 +; VI-NEXT: s_cbranch_execnz .LBB124_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -7378,7 +8638,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v8, v1 @@ -7392,7 +8652,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB102_1 +; GFX9-NEXT: s_cbranch_execnz .LBB124_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7419,7 +8679,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: .LBB103_1: ; %atomicrmw.start +; SI-NEXT: .LBB125_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc @@ -7437,7 +8697,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB103_1 +; SI-NEXT: s_cbranch_execnz .LBB125_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -7460,7 +8720,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: .LBB103_1: ; %atomicrmw.start +; VI-NEXT: .LBB125_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc @@ -7473,7 +8733,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB103_1 +; VI-NEXT: s_cbranch_execnz .LBB125_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -7493,7 +8753,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc @@ -7506,7 +8766,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB103_1 +; GFX9-NEXT: s_cbranch_execnz .LBB125_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -7533,7 +8793,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: .LBB104_1: ; %atomicrmw.start +; SI-NEXT: .LBB126_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc @@ -7551,7 +8811,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB104_1 +; SI-NEXT: s_cbranch_execnz .LBB126_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -7579,7 +8839,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: .LBB104_1: ; %atomicrmw.start +; VI-NEXT: .LBB126_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v9, v3 ; VI-NEXT: v_mov_b32_e32 v8, v2 @@ -7592,7 +8852,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; VI-NEXT: s_cbranch_execnz .LBB104_1 +; VI-NEXT: s_cbranch_execnz .LBB126_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -7615,7 +8875,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 @@ -7628,7 +8888,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_cbranch_execnz .LBB104_1 +; GFX9-NEXT: s_cbranch_execnz .LBB126_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -7658,7 +8918,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: .LBB105_1: ; %atomicrmw.start +; SI-NEXT: .LBB127_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc @@ -7676,7 +8936,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB105_1 +; SI-NEXT: s_cbranch_execnz .LBB127_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -7693,7 +8953,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: .LBB105_1: ; %atomicrmw.start +; VI-NEXT: .LBB127_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc @@ -7706,7 +8966,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB105_1 +; VI-NEXT: s_cbranch_execnz .LBB127_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -7722,7 +8982,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc @@ -7735,7 +8995,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB105_1 +; GFX9-NEXT: s_cbranch_execnz .LBB127_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -7760,7 +9020,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: .LBB106_1: ; %atomicrmw.start +; SI-NEXT: .LBB128_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc @@ -7778,7 +9038,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB106_1 +; SI-NEXT: s_cbranch_execnz .LBB128_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -7804,7 +9064,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: v_mov_b32_e32 v3, s9 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: .LBB106_1: ; %atomicrmw.start +; VI-NEXT: .LBB128_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v9, v3 ; VI-NEXT: v_mov_b32_e32 v8, v2 @@ -7817,7 +9077,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB106_1 +; VI-NEXT: s_cbranch_execnz .LBB128_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -7840,7 +9100,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 @@ -7853,7 +9113,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_cbranch_execnz .LBB106_1 +; GFX9-NEXT: s_cbranch_execnz .LBB128_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -7866,24 +9126,208 @@ entry: ret void } -; --------------------------------------------------------------------- -; atomicrmw uinc_wrap -; --------------------------------------------------------------------- - -define void @global_atomic_uinc_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) { -; SI-LABEL: global_atomic_uinc_wrap_i64_noret: +define void @global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB129_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB129_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB129_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB129_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB129_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB129_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB130_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_cmp_le_i64_e32 vcc, v[10:11], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB130_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB130_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB130_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB130_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB130_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw uinc_wrap +; --------------------------------------------------------------------- + +define void @global_atomic_uinc_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) { +; SI-LABEL: global_atomic_uinc_wrap_i64_noret: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; ; VI-LABEL: global_atomic_uinc_wrap_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8230,6 +9674,80 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspa ret i64 %result } +define void @global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + ; --------------------------------------------------------------------- ; atomicrmw udec_wrap ; --------------------------------------------------------------------- @@ -8593,3 +10111,79 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_offset_scalar(ptr addrspa %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst ret i64 %result } + +define void @global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index d8e5278..794c87b 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -10986,9 +10986,750 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ret void } +define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB18_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB18_2 +; GFX7LESS-NEXT: .LBB18_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB18_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB18_2 +; GFX9-NEXT: .LBB18_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB18_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1064-NEXT: .LBB18_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB18_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1032-NEXT: .LBB18_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB18_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1164-NEXT: .LBB18_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB18_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1132-NEXT: .LBB18_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB18_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB18_2 +; GFX9-DPP-NEXT: .LBB18_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB18_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-DPP-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1064-DPP-NEXT: .LBB18_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB18_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-DPP-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1032-DPP-NEXT: .LBB18_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB18_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-DPP-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1164-DPP-NEXT: .LBB18_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB18_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-DPP-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1132-DPP-NEXT: .LBB18_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory.access !1, !amdgpu.ignore.denormal.mode !1 + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB19_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB19_2 +; GFX7LESS-NEXT: .LBB19_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB19_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB19_2 +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB19_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1064-NEXT: .LBB19_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB19_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1032-NEXT: .LBB19_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB19_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1164-NEXT: .LBB19_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB19_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1132-NEXT: .LBB19_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB19_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB19_2 +; GFX9-DPP-NEXT: .LBB19_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB19_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-DPP-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1064-DPP-NEXT: .LBB19_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB19_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-DPP-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1032-DPP-NEXT: .LBB19_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB19_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-DPP-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1164-DPP-NEXT: .LBB19_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB19_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-DPP-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1132-DPP-NEXT: .LBB19_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory.access !1 + ret void +} + attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #2 = { strictfp } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} +!1 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index fde8341..4c829f3 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -7510,7 +7510,679 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ret void } +define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB12_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_2 +; GFX7LESS-NEXT: .LBB12_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB12_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB12_2 +; GFX9-NEXT: .LBB12_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB12_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1064-NEXT: .LBB12_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB12_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1032-NEXT: .LBB12_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB12_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1164-NEXT: .LBB12_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB12_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1132-NEXT: .LBB12_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX9-DPP-NEXT: .LBB12_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1064-DPP-NEXT: .LBB12_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1032-DPP-NEXT: .LBB12_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1164-DPP-NEXT: .LBB12_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1132-DPP-NEXT: .LBB12_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory.access !1, !amdgpu.ignore.denormal.mode !1 + ret void +} + +define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-NEXT: .LBB13_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-NEXT: .LBB13_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-NEXT: .LBB13_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-NEXT: .LBB13_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-NEXT: .LBB13_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-NEXT: .LBB13_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-DPP-NEXT: .LBB13_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-DPP-NEXT: .LBB13_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-DPP-NEXT: .LBB13_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-DPP-NEXT: .LBB13_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-DPP-NEXT: .LBB13_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory.access !1 + ret void +} + attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} +!1 = !{} + diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index 2cedb3e..0b889d9 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -7510,7 +7510,678 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ret void } +define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB12_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_2 +; GFX7LESS-NEXT: .LBB12_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB12_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB12_2 +; GFX9-NEXT: .LBB12_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB12_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1064-NEXT: .LBB12_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB12_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1032-NEXT: .LBB12_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB12_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1164-NEXT: .LBB12_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB12_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1132-NEXT: .LBB12_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX9-DPP-NEXT: .LBB12_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1064-DPP-NEXT: .LBB12_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1032-DPP-NEXT: .LBB12_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1164-DPP-NEXT: .LBB12_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1132-DPP-NEXT: .LBB12_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory.access !1, !amdgpu.ignore.denormal.mode !1 + ret void +} + +define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-NEXT: .LBB13_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-NEXT: .LBB13_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-NEXT: .LBB13_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-NEXT: .LBB13_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-NEXT: .LBB13_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-NEXT: .LBB13_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-DPP-NEXT: .LBB13_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-DPP-NEXT: .LBB13_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-DPP-NEXT: .LBB13_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-DPP-NEXT: .LBB13_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-DPP-NEXT: .LBB13_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory.access !1 + ret void +} + attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} +!1 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll index 4477f02..89abdb2 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll @@ -1673,3 +1673,713 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { %result = atomicrmw fadd ptr addrspace(3) %ptr, bfloat 4.0 seq_cst ret void } + +define float @lds_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspace(3) %ptr) { +; VI-LABEL: lds_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, 4.0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_add_rtn_f32 v0, v0, v1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b32 v1, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(3) %ptr, float 4.0 seq_cst, !amdgpu.ignore.denormal.mode !0 + ret float %result +} + +define void @lds_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrspace(3) %ptr) { +; VI-LABEL: lds_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, 4.0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_add_f32 v0, v1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX9-NEXT: ds_add_f32 v0, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b32 v1, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_add_f32_e32 v2, 4.0, v1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(3) %ptr, float 4.0 seq_cst, !amdgpu.ignore.denormal.mode !0 + ret void +} + +define <2 x half> @lds_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> %val) { +; VI-LABEL: lds_atomic_fadd_ret_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_read_b32 v2, v0 +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB14_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_add_f16_e32 v4, v3, v1 +; VI-NEXT: v_or_b32_e32 v2, v4, v2 +; VI-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB14_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fadd_ret_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v2, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-NEXT: v_pk_add_f16 v2, v3, v1 +; GFX9-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB14_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fadd_ret_v2f16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b32 v3, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fadd_ret_v2f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v2 +; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX8-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX8-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %val seq_cst + ret <2 x half> %result +} + +define void @lds_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %val) { +; VI-LABEL: lds_atomic_fadd_noret_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_read_b32 v2, v0 +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB15_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_add_f16_e32 v4, v2, v1 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v2, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB15_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fadd_noret_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v2, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_pk_add_f16 v3, v2, v1 +; GFX9-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB15_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fadd_noret_v2f16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b32 v3, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fadd_noret_v2f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX8-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX8-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX8-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %val seq_cst + ret void +} + +define <2 x bfloat> @lds_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> %val) { +; VI-LABEL: lds_atomic_fadd_ret_v2bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_read_b32 v2, v0 +; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: .LBB16_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; VI-NEXT: v_add_f32_e32 v2, v2, v3 +; VI-NEXT: v_add_f32_e32 v5, v5, v1 +; VI-NEXT: v_bfe_u32 v6, v2, 16, 1 +; VI-NEXT: v_bfe_u32 v8, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; VI-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; VI-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: s_cbranch_execnz .LBB16_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[6:7] +; VI-NEXT: v_mov_b32_e32 v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fadd_ret_v2bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v2, v0 +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: s_movk_i32 s8, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: s_mov_b32 s9, 0x7060302 +; GFX9-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX9-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX9-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX9-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX9-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX9-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX9-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB16_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fadd_ret_v2bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v4 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fadd_ret_v2bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v4 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %val seq_cst + ret <2 x bfloat> %result +} + +define void @lds_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> %val) { +; VI-LABEL: lds_atomic_fadd_noret_v2bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_read_b32 v3, v0 +; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: .LBB17_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; VI-NEXT: v_add_f32_e32 v4, v4, v2 +; VI-NEXT: v_add_f32_e32 v5, v5, v1 +; VI-NEXT: v_bfe_u32 v6, v4, 16, 1 +; VI-NEXT: v_bfe_u32 v8, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; VI-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; VI-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: v_mov_b32_e32 v3, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: s_cbranch_execnz .LBB17_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[6:7] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fadd_noret_v2bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v3, v0 +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: s_movk_i32 s8, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: s_mov_b32 s9, 0x7060302 +; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX9-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX9-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX9-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX9-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX9-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB17_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fadd_noret_v2bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b32 v4, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fadd_noret_v2bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b32 v4, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %val seq_cst + ret void +} + +!0 = !{} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll new file mode 100644 index 0000000..8bbbcd1 --- /dev/null +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll @@ -0,0 +1,3717 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX803 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX906 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX908 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX90A %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX940 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX10 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX11 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX12 %s + +;--------------------------------------------------------------------- +; atomicrmw xchg +;--------------------------------------------------------------------- + +; xchg is supported over PCIe, so no expansion is necessary +define float @test_atomicrmw_xchg_f32_global_system(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; COMMON-NEXT: [[TMP1:%.*]] = bitcast float [[VALUE]] to i32 +; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[TMP1]] seq_cst, align 4 +; COMMON-NEXT: [[RES:%.*]] = bitcast i32 [[TMP2]] to float +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst + ret float %res +} + +; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = bitcast float [[VALUE]] to i32 +; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[TMP1]] seq_cst, align 4 +; COMMON-NEXT: [[RES:%.*]] = bitcast i32 [[TMP2]] to float +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %res +} + +; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = bitcast float [[VALUE]] to i32 +; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[TMP1]] seq_cst, align 4 +; COMMON-NEXT: [[RES:%.*]] = bitcast i32 [[TMP2]] to float +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret float %res +} + +; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = bitcast float [[VALUE]] to i32 +; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[TMP1]] seq_cst, align 4 +; COMMON-NEXT: [[RES:%.*]] = bitcast i32 [[TMP2]] to float +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret float %res +} + +;--------------------------------------------------------------------- +; atomicrmw fadd +;--------------------------------------------------------------------- + +define float @test_atomicrmw_fadd_f32_global_system(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP5]] +; +; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4 +; GFX940-NEXT: ret float [[RES]] +; +; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP5]] +; +; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; +; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst + ret float %res +} + +define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP5]] +; +; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP5]] +; +; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; +; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %res +} + +define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP5]] +; +; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP5]] +; +; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; +; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret float %res +} + +define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP5]] +; +; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP5]] +; +; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; +; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret float %res +} + +define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(ptr addrspace(1) %ptr, float %value) #0 { +; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP5]] +; +; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP5]] +; +; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; +; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret float %res +} + +define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(ptr addrspace(1) %ptr, float %value) #1 { +; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP5]] +; +; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP5]] +; +; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; +; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret float %res +} + +define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP5]] +; +; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP5]] +; +; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; +; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP5]] +; +; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP5]] +; +; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; +; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP5]] +; +; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP5]] +; +; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; +; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP5]] +; +; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP5]] +; +; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; +; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(ptr addrspace(1) %ptr, float %value) #0 { +; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP5]] +; +; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP5]] +; +; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; +; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(ptr addrspace(1) %ptr, float %value) #1 { +; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP5]] +; +; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP5]] +; +; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; +; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +;--------------------------------------------------------------------- +; atomicrmw fadd (no return) +;--------------------------------------------------------------------- + +define void @test_atomicrmw_fadd_noret_f32_global_system(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret void +; +; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret void +; +; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4 +; GFX940-NEXT: ret void +; +; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret void +; +; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; +; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst + ret void +} + +define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret void +; +; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret void +; +; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret void +; +; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret void +; +; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; +; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret void +; +; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret void +; +; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]] +; GFX940-NEXT: ret void +; +; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret void +; +; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; +; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret void +; +; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret void +; +; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; GFX940-NEXT: ret void +; +; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret void +; +; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; +; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret void +} + +define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(ptr addrspace(1) %ptr, float %value) #0 { +; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret void +; +; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret void +; +; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; GFX940-NEXT: ret void +; +; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret void +; +; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; +; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret void +} + +define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(ptr addrspace(1) %ptr, float %value) #1 { +; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret void +; +; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret void +; +; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; GFX940-NEXT: ret void +; +; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret void +; +; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; +; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret void +} + +define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret void +; +; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret void +; +; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret void +; +; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret void +; +; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; +; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret void +} + +define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret void +; +; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret void +; +; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret void +; +; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret void +; +; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; +; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret void +} + +define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret void +; +; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret void +; +; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret void +; +; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret void +; +; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; +; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret void +} + +define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret void +; +; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret void +; +; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret void +; +; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret void +; +; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; +; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret void +} + +define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(ptr addrspace(1) %ptr, float %value) #0 { +; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret void +; +; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret void +; +; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret void +; +; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret void +; +; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; +; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret void +} + +define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(ptr addrspace(1) %ptr, float %value) #1 { +; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret void +; +; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret void +; +; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret void +; +; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret void +; +; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; +; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret void +} + +;--------------------------------------------------------------------- +; atomicrmw fsub +;--------------------------------------------------------------------- + +define float @test_atomicrmw_fsub_f32_global_system(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value seq_cst + ret float %res +} + +define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %res +} + +define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret float %res +} + +define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret float %res +} + +define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +;--------------------------------------------------------------------- +; atomicrmw fmax +;--------------------------------------------------------------------- + +define float @test_atomicrmw_fmax_f32_global_system(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst + ret float %res +} + +define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %res +} + +define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret float %res +} + +define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret float %res +} + +define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +;--------------------------------------------------------------------- +; atomicrmw fmin +;--------------------------------------------------------------------- + +define float @test_atomicrmw_fmin_f32_global_system(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst + ret float %res +} + +define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %res +} + +define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret float %res +} + +define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret float %res +} + +define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +attributes #0 = { "denormal-fp-mode-f32"="preserve-sign,preserve-sign" } +attributes #1 = { "denormal-fp-mode-f32"="dynamic,dynamic" } + +!0 = !{} +;. +; GFX940: [[META0]] = !{} +;. diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll new file mode 100644 index 0000000..e1890da --- /dev/null +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll @@ -0,0 +1,1685 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX803 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX906 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX908 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX90A %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX940 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX10 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX11 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX12 %s + +;--------------------------------------------------------------------- +; atomicrmw xchg +;--------------------------------------------------------------------- + +; xchg is supported over PCIe, so no expansion is necessary +define double @test_atomicrmw_xchg_f64_global_system(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; COMMON-NEXT: [[TMP1:%.*]] = bitcast double [[VALUE]] to i64 +; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[TMP1]] seq_cst, align 8 +; COMMON-NEXT: [[RES:%.*]] = bitcast i64 [[TMP2]] to double +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst + ret double %res +} + +; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = bitcast double [[VALUE]] to i64 +; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[TMP1]] seq_cst, align 8 +; COMMON-NEXT: [[RES:%.*]] = bitcast i64 [[TMP2]] to double +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = bitcast double [[VALUE]] to i64 +; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[TMP1]] seq_cst, align 8 +; COMMON-NEXT: [[RES:%.*]] = bitcast i64 [[TMP2]] to double +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret double %res +} + +; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = bitcast double [[VALUE]] to i64 +; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[TMP1]] seq_cst, align 8 +; COMMON-NEXT: [[RES:%.*]] = bitcast i64 [[TMP2]] to double +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret double %res +} + +;--------------------------------------------------------------------- +; atomicrmw fadd +;--------------------------------------------------------------------- + +define double @test_atomicrmw_fadd_f64_global_system(ptr addrspace(1) %ptr, double %value) { +; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret double [[TMP5]] +; +; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP5]] +; +; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret double [[TMP5]] +; +; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_system( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8 +; GFX940-NEXT: ret double [[RES]] +; +; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret double [[TMP5]] +; +; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[TMP5]] +; +; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst + ret double %res +} + +define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) { +; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret double [[TMP5]] +; +; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP5]] +; +; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret double [[TMP5]] +; +; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret double [[TMP5]] +; +; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[TMP5]] +; +; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret double [[TMP5]] +; +; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP5]] +; +; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret double [[TMP5]] +; +; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret double [[TMP5]] +; +; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[TMP5]] +; +; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret double %res +} + +define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret double [[TMP5]] +; +; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP5]] +; +; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret double [[TMP5]] +; +; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret double [[TMP5]] +; +; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[TMP5]] +; +; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret double %res +} + +define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz(ptr addrspace(1) %ptr, double %value) #0 { +; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret double [[TMP5]] +; +; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP5]] +; +; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret double [[TMP5]] +; +; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret double [[TMP5]] +; +; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[TMP5]] +; +; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret double %res +} + +define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic(ptr addrspace(1) %ptr, double %value) #1 { +; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret double [[TMP5]] +; +; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP5]] +; +; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret double [[TMP5]] +; +; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret double [[TMP5]] +; +; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[TMP5]] +; +; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret double %res +} + +define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP2:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP3:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP1]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) +; COMMON-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8 +; COMMON-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) +; COMMON-NEXT: [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]]) +; COMMON-NEXT: [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0 +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP5]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { double, i1 } [[TMP8]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[NEWLOADED]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP2:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP3:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP1]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) +; COMMON-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8 +; COMMON-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) +; COMMON-NEXT: [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]]) +; COMMON-NEXT: [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0 +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP5]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { double, i1 } [[TMP8]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[NEWLOADED]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP2:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP3:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP1]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) +; COMMON-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8 +; COMMON-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) +; COMMON-NEXT: [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]]) +; COMMON-NEXT: [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0 +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP5]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { double, i1 } [[TMP8]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[NEWLOADED]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP2:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP3:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP1]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) +; COMMON-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8 +; COMMON-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) +; COMMON-NEXT: [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]]) +; COMMON-NEXT: [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0 +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP5]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { double, i1 } [[TMP8]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[NEWLOADED]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(ptr addrspace(1) %ptr, double %value) #0 { +; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; COMMON-NEXT: [[TMP1:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP2:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP3:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP1]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) +; COMMON-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8 +; COMMON-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) +; COMMON-NEXT: [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]]) +; COMMON-NEXT: [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0 +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP5]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { double, i1 } [[TMP8]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[NEWLOADED]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(ptr addrspace(1) %ptr, double %value) #1 { +; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; COMMON-NEXT: [[TMP1:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP2:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP3:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP1]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) +; COMMON-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8 +; COMMON-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) +; COMMON-NEXT: [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]]) +; COMMON-NEXT: [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0 +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP5]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { double, i1 } [[TMP8]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[NEWLOADED]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +;--------------------------------------------------------------------- +; atomicrmw fsub +;--------------------------------------------------------------------- + +define double @test_atomicrmw_fsub_f64_global_system(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value seq_cst + ret double %res +} + +define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret double %res +} + +define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret double %res +} + +define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP3:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP2:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) +; COMMON-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8 +; COMMON-NEXT: [[TMP9:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) +; COMMON-NEXT: [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0 +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP9]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1 +; COMMON-NEXT: [[TMP5]] = extractvalue { double, i1 } [[TMP8]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP3:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP2:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) +; COMMON-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8 +; COMMON-NEXT: [[TMP9:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) +; COMMON-NEXT: [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0 +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP9]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1 +; COMMON-NEXT: [[TMP5]] = extractvalue { double, i1 } [[TMP8]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP3:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP2:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) +; COMMON-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8 +; COMMON-NEXT: [[TMP9:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) +; COMMON-NEXT: [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0 +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP9]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1 +; COMMON-NEXT: [[TMP5]] = extractvalue { double, i1 } [[TMP8]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP3:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP2:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) +; COMMON-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8 +; COMMON-NEXT: [[TMP9:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) +; COMMON-NEXT: [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0 +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP9]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1 +; COMMON-NEXT: [[TMP5]] = extractvalue { double, i1 } [[TMP8]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +;--------------------------------------------------------------------- +; atomicrmw fmax +;--------------------------------------------------------------------- + +define double @test_atomicrmw_fmax_f64_global_system(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst + ret double %res +} + +define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret double %res +} + +define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret double %res +} + +define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP3:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP4:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP4]], align 8 +; COMMON-NEXT: [[TMP10:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP4]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: [[TMP7:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } poison, double [[TMP7]], 0 +; COMMON-NEXT: [[TMP9:%.*]] = insertvalue { double, i1 } [[TMP8]], i1 [[TMP10]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP9]], 1 +; COMMON-NEXT: [[TMP6]] = extractvalue { double, i1 } [[TMP9]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP3:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP4:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP4]], align 8 +; COMMON-NEXT: [[TMP10:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP4]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: [[TMP7:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } poison, double [[TMP7]], 0 +; COMMON-NEXT: [[TMP9:%.*]] = insertvalue { double, i1 } [[TMP8]], i1 [[TMP10]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP9]], 1 +; COMMON-NEXT: [[TMP6]] = extractvalue { double, i1 } [[TMP9]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP3:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP4:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP4]], align 8 +; COMMON-NEXT: [[TMP10:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP4]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: [[TMP7:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } poison, double [[TMP7]], 0 +; COMMON-NEXT: [[TMP9:%.*]] = insertvalue { double, i1 } [[TMP8]], i1 [[TMP10]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP9]], 1 +; COMMON-NEXT: [[TMP6]] = extractvalue { double, i1 } [[TMP9]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP3:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP4:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP4]], align 8 +; COMMON-NEXT: [[TMP10:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP4]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: [[TMP7:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } poison, double [[TMP7]], 0 +; COMMON-NEXT: [[TMP9:%.*]] = insertvalue { double, i1 } [[TMP8]], i1 [[TMP10]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP9]], 1 +; COMMON-NEXT: [[TMP6]] = extractvalue { double, i1 } [[TMP9]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +;--------------------------------------------------------------------- +; atomicrmw fmin +;--------------------------------------------------------------------- + +define double @test_atomicrmw_fmin_f64_global_system(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst + ret double %res +} + +define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret double %res +} + +define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret double %res +} + +define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP3:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP4:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP4]], align 8 +; COMMON-NEXT: [[TMP10:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP4]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: [[TMP7:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } poison, double [[TMP7]], 0 +; COMMON-NEXT: [[TMP9:%.*]] = insertvalue { double, i1 } [[TMP8]], i1 [[TMP10]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP9]], 1 +; COMMON-NEXT: [[TMP6]] = extractvalue { double, i1 } [[TMP9]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP3:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP4:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP4]], align 8 +; COMMON-NEXT: [[TMP10:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP4]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: [[TMP7:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } poison, double [[TMP7]], 0 +; COMMON-NEXT: [[TMP9:%.*]] = insertvalue { double, i1 } [[TMP8]], i1 [[TMP10]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP9]], 1 +; COMMON-NEXT: [[TMP6]] = extractvalue { double, i1 } [[TMP9]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP3:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP4:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP4]], align 8 +; COMMON-NEXT: [[TMP10:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP4]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: [[TMP7:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } poison, double [[TMP7]], 0 +; COMMON-NEXT: [[TMP9:%.*]] = insertvalue { double, i1 } [[TMP8]], i1 [[TMP10]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP9]], 1 +; COMMON-NEXT: [[TMP6]] = extractvalue { double, i1 } [[TMP9]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP3:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP4:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP4]], align 8 +; COMMON-NEXT: [[TMP10:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP4]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: [[TMP7:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } poison, double [[TMP7]], 0 +; COMMON-NEXT: [[TMP9:%.*]] = insertvalue { double, i1 } [[TMP8]], i1 [[TMP10]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP9]], 1 +; COMMON-NEXT: [[TMP6]] = extractvalue { double, i1 } [[TMP9]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +attributes #0 = { "denormal-fp-mode"="preserve-sign,preserve-sign" } +attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" } + +!0 = !{} +;. +; GFX940: [[META0]] = !{} +;. diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll new file mode 100644 index 0000000..b711b9f --- /dev/null +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll @@ -0,0 +1,828 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX803 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX906 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX908 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX90A %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX940 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX10 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX11 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX12 %s + +;--------------------------------------------------------------------- +; atomicrmw xchg +;--------------------------------------------------------------------- + +; xchg is supported over PCIe, so no expansion is necessary +define i32 @test_atomicrmw_xchg_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_xchg_i32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4 +; COMMON-NEXT: ret i32 [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, i32 %value seq_cst + ret i32 %res +} + +; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define i32 @test_atomicrmw_xchg_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_xchg_i32_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; COMMON-NEXT: ret i32 [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define i32 @test_atomicrmw_xchg_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_xchg_i32_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i32 [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define i32 @test_atomicrmw_xchg_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_xchg_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i32 [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +;--------------------------------------------------------------------- +; atomicrmw add +;--------------------------------------------------------------------- + +; add is supported over PCIe, so no expansion is necessary +define i32 @test_atomicrmw_add_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_add_i32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4 +; COMMON-NEXT: ret i32 [[RES]] +; + %res = atomicrmw add ptr addrspace(1) %ptr, i32 %value seq_cst + ret i32 %res +} + +; add is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define i32 @test_atomicrmw_add_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_add_i32_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: ret i32 [[RES]] +; + %res = atomicrmw add ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +; add is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define i32 @test_atomicrmw_add_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_add_i32_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i32 [[RES]] +; + %res = atomicrmw add ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +; add is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define i32 @test_atomicrmw_add_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_add_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i32 [[RES]] +; + %res = atomicrmw add ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +;--------------------------------------------------------------------- +; atomicrmw sub +;--------------------------------------------------------------------- + +; expansion is necessary, sub is not supported over PCIe +define i32 @test_atomicrmw_sub_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_sub_i32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4 +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw sub ptr addrspace(1) %ptr, i32 %value seq_cst + ret i32 %res +} + +define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw sub ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw sub ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw sub ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +;--------------------------------------------------------------------- +; atomicrmw and +;--------------------------------------------------------------------- + +; expansion is necessary, operation not supported over PCIe +define i32 @test_atomicrmw_and_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_and_i32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4 +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw and ptr addrspace(1) %ptr, i32 %value seq_cst + ret i32 %res +} + +define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw and ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw and ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw and ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +;--------------------------------------------------------------------- +; atomicrmw nand +;--------------------------------------------------------------------- + +; expansion is necessary, operation not supported +define i32 @test_atomicrmw_nand_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_nand_i32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw nand ptr addrspace(1) %ptr, i32 %value seq_cst + ret i32 %res +} + +define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw nand ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw nand ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw nand ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +;--------------------------------------------------------------------- +; atomicrmw or +;--------------------------------------------------------------------- + +; expansion is necessary, operation not supported over PCIe +define i32 @test_atomicrmw_or_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_or_i32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4 +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw or ptr addrspace(1) %ptr, i32 %value seq_cst + ret i32 %res +} + +define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw or ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw or ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw or ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +;--------------------------------------------------------------------- +; atomicrmw xor +;--------------------------------------------------------------------- + +; expansion is necessary, operation not supported over PCIe +define i32 @test_atomicrmw_xor_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_xor_i32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4 +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %value seq_cst + ret i32 %res +} + +define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +;--------------------------------------------------------------------- +; atomicrmw max +;--------------------------------------------------------------------- + +; expansion is necessary, operation not supported over PCIe +define i32 @test_atomicrmw_max_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_max_i32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw max ptr addrspace(1) %ptr, i32 %value seq_cst + ret i32 %res +} + +define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw max ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw max ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw max ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +;--------------------------------------------------------------------- +; atomicrmw min +;--------------------------------------------------------------------- + +; expansion is necessary, operation not supported over PCIe +define i32 @test_atomicrmw_min_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_min_i32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw min ptr addrspace(1) %ptr, i32 %value seq_cst + ret i32 %res +} + +define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw min ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw min ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw min ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +;--------------------------------------------------------------------- +; atomicrmw umax +;--------------------------------------------------------------------- + +; expansion is necessary, operation not supported over PCIe +define i32 @test_atomicrmw_umax_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_umax_i32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw umax ptr addrspace(1) %ptr, i32 %value seq_cst + ret i32 %res +} + +define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw umax ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw umax ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw umax ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +;--------------------------------------------------------------------- +; atomicrmw umin +;--------------------------------------------------------------------- + +; expansion is necessary, operation not supported over PCIe +define i32 @test_atomicrmw_umin_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_umin_i32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw umin ptr addrspace(1) %ptr, i32 %value seq_cst + ret i32 %res +} + +define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw umin ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw umin ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw umin ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +;--------------------------------------------------------------------- +; atomicrmw uinc_wrap +;--------------------------------------------------------------------- + +; expansion is necessary, operation not supported over PCIe +define i32 @test_atomicrmw_uinc_wrap_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4 +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %value seq_cst + ret i32 %res +} + +define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +;--------------------------------------------------------------------- +; atomicrmw udec_wrap +;--------------------------------------------------------------------- + +; expansion is necessary, operation not supported over PCIe +define i32 @test_atomicrmw_udec_wrap_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4 +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %value seq_cst + ret i32 %res +} + +define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +!0 = !{} +;. +; GFX803: [[META0]] = !{} +;. +; GFX906: [[META0]] = !{} +;. +; GFX908: [[META0]] = !{} +;. +; GFX90A: [[META0]] = !{} +;. +; GFX940: [[META0]] = !{} +;. +; GFX10: [[META0]] = !{} +;. +; GFX11: [[META0]] = !{} +;. +; GFX12: [[META0]] = !{} +;. +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX10: {{.*}} +; GFX11: {{.*}} +; GFX12: {{.*}} +; GFX803: {{.*}} +; GFX906: {{.*}} +; GFX908: {{.*}} +; GFX90A: {{.*}} +; GFX940: {{.*}} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll new file mode 100644 index 0000000..d67bf2e --- /dev/null +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll @@ -0,0 +1,828 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX803 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX906 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX908 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX90A %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX940 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX10 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX11 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX12 %s + +;--------------------------------------------------------------------- +; atomicrmw xchg +;--------------------------------------------------------------------- + +; xchg is supported over PCIe, so no expansion is necessary +define i64 @test_atomicrmw_xchg_i64_global_system(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_xchg_i64_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8 +; COMMON-NEXT: ret i64 [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, i64 %value seq_cst + ret i64 %res +} + +; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define i64 @test_atomicrmw_xchg_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_xchg_i64_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; COMMON-NEXT: ret i64 [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define i64 @test_atomicrmw_xchg_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_xchg_i64_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i64 [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define i64 @test_atomicrmw_xchg_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_xchg_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i64 [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +;--------------------------------------------------------------------- +; atomicrmw add +;--------------------------------------------------------------------- + +; add is supported over PCIe, so no expansion is necessary +define i64 @test_atomicrmw_add_i64_global_system(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_add_i64_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8 +; COMMON-NEXT: ret i64 [[RES]] +; + %res = atomicrmw add ptr addrspace(1) %ptr, i64 %value seq_cst + ret i64 %res +} + +; add is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define i64 @test_atomicrmw_add_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_add_i64_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: ret i64 [[RES]] +; + %res = atomicrmw add ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +; add is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define i64 @test_atomicrmw_add_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_add_i64_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i64 [[RES]] +; + %res = atomicrmw add ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +; add is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define i64 @test_atomicrmw_add_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_add_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i64 [[RES]] +; + %res = atomicrmw add ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +;--------------------------------------------------------------------- +; atomicrmw sub +;--------------------------------------------------------------------- + +; expansion is necessary, sub is not supported over PCIe +define i64 @test_atomicrmw_sub_i64_global_system(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_sub_i64_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8 +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw sub ptr addrspace(1) %ptr, i64 %value seq_cst + ret i64 %res +} + +define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw sub ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw sub ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw sub ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +;--------------------------------------------------------------------- +; atomicrmw and +;--------------------------------------------------------------------- + +; expansion is necessary, operation not supported over PCIe +define i64 @test_atomicrmw_and_i64_global_system(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_and_i64_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8 +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw and ptr addrspace(1) %ptr, i64 %value seq_cst + ret i64 %res +} + +define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw and ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw and ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw and ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +;--------------------------------------------------------------------- +; atomicrmw nand +;--------------------------------------------------------------------- + +; expansion is necessary, operation not supported +define i64 @test_atomicrmw_nand_i64_global_system(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_nand_i64_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw nand ptr addrspace(1) %ptr, i64 %value seq_cst + ret i64 %res +} + +define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw nand ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw nand ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw nand ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +;--------------------------------------------------------------------- +; atomicrmw or +;--------------------------------------------------------------------- + +; expansion is necessary, operation not supported over PCIe +define i64 @test_atomicrmw_or_i64_global_system(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_or_i64_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8 +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw or ptr addrspace(1) %ptr, i64 %value seq_cst + ret i64 %res +} + +define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw or ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw or ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw or ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +;--------------------------------------------------------------------- +; atomicrmw xor +;--------------------------------------------------------------------- + +; expansion is necessary, operation not supported over PCIe +define i64 @test_atomicrmw_xor_i64_global_system(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_xor_i64_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8 +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw xor ptr addrspace(1) %ptr, i64 %value seq_cst + ret i64 %res +} + +define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw xor ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw xor ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw xor ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +;--------------------------------------------------------------------- +; atomicrmw max +;--------------------------------------------------------------------- + +; expansion is necessary, operation not supported over PCIe +define i64 @test_atomicrmw_max_i64_global_system(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_max_i64_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw max ptr addrspace(1) %ptr, i64 %value seq_cst + ret i64 %res +} + +define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw max ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw max ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw max ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +;--------------------------------------------------------------------- +; atomicrmw min +;--------------------------------------------------------------------- + +; expansion is necessary, operation not supported over PCIe +define i64 @test_atomicrmw_min_i64_global_system(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_min_i64_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw min ptr addrspace(1) %ptr, i64 %value seq_cst + ret i64 %res +} + +define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw min ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw min ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw min ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +;--------------------------------------------------------------------- +; atomicrmw umax +;--------------------------------------------------------------------- + +; expansion is necessary, operation not supported over PCIe +define i64 @test_atomicrmw_umax_i64_global_system(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_umax_i64_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw umax ptr addrspace(1) %ptr, i64 %value seq_cst + ret i64 %res +} + +define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw umax ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw umax ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw umax ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +;--------------------------------------------------------------------- +; atomicrmw umin +;--------------------------------------------------------------------- + +; expansion is necessary, operation not supported over PCIe +define i64 @test_atomicrmw_umin_i64_global_system(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_umin_i64_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw umin ptr addrspace(1) %ptr, i64 %value seq_cst + ret i64 %res +} + +define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw umin ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw umin ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw umin ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +;--------------------------------------------------------------------- +; atomicrmw uinc_wrap +;--------------------------------------------------------------------- + +; expansion is necessary, operation not supported over PCIe +define i64 @test_atomicrmw_uinc_wrap_i64_global_system(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8 +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %value seq_cst + ret i64 %res +} + +define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +;--------------------------------------------------------------------- +; atomicrmw udec_wrap +;--------------------------------------------------------------------- + +; expansion is necessary, operation not supported over PCIe +define i64 @test_atomicrmw_udec_wrap_i64_global_system(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8 +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %value seq_cst + ret i64 %res +} + +define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +!0 = !{} +;. +; GFX803: [[META0]] = !{} +;. +; GFX906: [[META0]] = !{} +;. +; GFX908: [[META0]] = !{} +;. +; GFX90A: [[META0]] = !{} +;. +; GFX940: [[META0]] = !{} +;. +; GFX10: [[META0]] = !{} +;. +; GFX11: [[META0]] = !{} +;. +; GFX12: [[META0]] = !{} +;. +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX10: {{.*}} +; GFX11: {{.*}} +; GFX12: {{.*}} +; GFX803: {{.*}} +; GFX906: {{.*}} +; GFX908: {{.*}} +; GFX90A: {{.*}} +; GFX940: {{.*}} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll index 8c7d8f5..337e51f 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll @@ -1750,7 +1750,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_unsafe_structfp(ptr addrspace ; CI-NEXT: br label [[ATOMICRMW_START:%.*]] ; CI: atomicrmw.start: ; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; CI-NEXT: [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6:[0-9]+]] +; CI-NEXT: [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9:[0-9]+]] ; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 ; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4 @@ -1766,7 +1766,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_unsafe_structfp(ptr addrspace ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX9: atomicrmw.start: ; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX9-NEXT: [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6:[0-9]+]] +; GFX9-NEXT: [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9:[0-9]+]] ; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 ; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4 @@ -1803,7 +1803,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe_strictfp(ptr addrspace(1) % ; CI-NEXT: br label [[ATOMICRMW_START:%.*]] ; CI: atomicrmw.start: ; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; CI-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]] +; CI-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9]] ; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 ; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8 @@ -1819,7 +1819,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe_strictfp(ptr addrspace(1) % ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX9: atomicrmw.start: ; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX9-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]] +; GFX9-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9]] ; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 ; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8 @@ -1835,7 +1835,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe_strictfp(ptr addrspace(1) % ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX908: atomicrmw.start: ; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6:[0-9]+]] +; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9:[0-9]+]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 ; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8 @@ -1859,7 +1859,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe_strictfp(ptr addrspace(1) % ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX11: atomicrmw.start: ; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX11-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6:[0-9]+]] +; GFX11-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9:[0-9]+]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 ; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8 @@ -1880,7 +1880,7 @@ define float @test_atomicrmw_fadd_f32_local_strictfp(ptr addrspace(3) %ptr, floa ; CI-NEXT: br label [[ATOMICRMW_START:%.*]] ; CI: atomicrmw.start: ; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; CI-NEXT: [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]] +; CI-NEXT: [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9]] ; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 ; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 @@ -2102,7 +2102,7 @@ define bfloat @test_atomicrmw_fadd_bf16_global_system_align4(ptr addrspace(1) %p define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bfloat %value) #2 { ; ALL-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp( -; ALL-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR6:[0-9]+]] +; ALL-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR9:[0-9]+]] ; ALL-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 ; ALL-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 ; ALL-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 @@ -2115,7 +2115,7 @@ define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bf ; ALL-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; ALL-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; ALL-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; ALL-NEXT: [[NEW:%.*]] = call bfloat @llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]] +; ALL-NEXT: [[NEW:%.*]] = call bfloat @llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9]] ; ALL-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 ; ALL-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 ; ALL-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] @@ -2259,6 +2259,2197 @@ define bfloat @test_atomicrmw_fadd_bf16_flat_system_align4(ptr %ptr, bfloat %val ret bfloat %res } +define void @test_atomicrmw_fadd_f32_global_system_noret(ptr addrspace(1) %ptr, float %value) { +; CI-LABEL: @test_atomicrmw_fadd_f32_global_system_noret( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_global_system_noret( +; GFX9-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_global_system_noret( +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_global_system_noret( +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_global_system_noret( +; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_system_noret( +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; + %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %value monotonic + ret void +} + +define float @test_atomicrmw_fadd_f32_global_system_ret(ptr addrspace(1) %ptr, float %value) { +; CI-LABEL: @test_atomicrmw_fadd_f32_global_system_ret( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret float [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_global_system_ret( +; GFX9-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_global_system_ret( +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_global_system_ret( +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_global_system_ret( +; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX940-NEXT: ret float [[RET]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_system_ret( +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; + %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %value monotonic + ret float %ret +} + +define void @test_atomicrmw_fadd_f32_global_system_noret__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) { +; CI-LABEL: @test_atomicrmw_fadd_f32_global_system_noret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0:![0-9]+]] +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; + %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %value monotonic, !amdgpu.ignore.denormal.mode !0 + ret void +} + +define float @test_atomicrmw_fadd_f32_global_system_ret__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) { +; CI-LABEL: @test_atomicrmw_fadd_f32_global_system_ret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret float [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret float [[RET]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; + %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %value monotonic, !amdgpu.ignore.denormal.mode !0 + ret float %ret +} + +define void @test_atomicrmw_fadd_f32_daz_global_system_noret(ptr addrspace(1) %ptr, float %value) #3 { +; CI-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_noret( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_noret( +; GFX9-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_noret( +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_noret( +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_noret( +; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_noret( +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; + %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %value monotonic + ret void +} + +define float @test_atomicrmw_fadd_f32_daz_global_system_ret(ptr addrspace(1) %ptr, float %value) #3 { +; CI-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_ret( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret float [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_ret( +; GFX9-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_ret( +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_ret( +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_ret( +; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX940-NEXT: ret float [[RET]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_ret( +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; + %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %value monotonic + ret float %ret +} + +define void @test_atomicrmw_fadd_f32_daz_global_system_noret__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) #3 { +; CI-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_noret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; + %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %value monotonic, !amdgpu.ignore.denormal.mode !0 + ret void +} + +define float @test_atomicrmw_fadd_f32_daz_global_system_ret__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) #3 { +; CI-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_ret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret float [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret float [[RET]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; + %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %value monotonic, !amdgpu.ignore.denormal.mode !0 + ret float %ret +} + +define void @test_atomicrmw_fadd_f32_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) #4 { +; CI-LABEL: @test_atomicrmw_fadd_f32_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; + %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %value monotonic, !amdgpu.ignore.denormal.mode !0 + ret void +} + +define float @test_atomicrmw_fadd_f32_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) #4 { +; CI-LABEL: @test_atomicrmw_fadd_f32_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret float [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret float [[RET]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; + %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %value monotonic, !amdgpu.ignore.denormal.mode !0 + ret float %ret +} + +define void @test_atomicrmw_fadd_f32_local_noret(ptr addrspace(3) %ptr, float %value) { +; CI-LABEL: @test_atomicrmw_fadd_f32_local_noret( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(3) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_local_noret( +; GFX9-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_local_noret( +; GFX908-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_local_noret( +; GFX90A-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_local_noret( +; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_local_noret( +; GFX11-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX11-NEXT: ret void +; + %unused = atomicrmw fadd ptr addrspace(3) %ptr, float %value monotonic + ret void +} + +define float @test_atomicrmw_fadd_f32_local_ret(ptr addrspace(3) %ptr, float %value) { +; CI-LABEL: @test_atomicrmw_fadd_f32_local_ret( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(3) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret float [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_local_ret( +; GFX9-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX9-NEXT: ret float [[RET]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_local_ret( +; GFX908-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX908-NEXT: ret float [[RET]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_local_ret( +; GFX90A-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX90A-NEXT: ret float [[RET]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_local_ret( +; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX940-NEXT: ret float [[RET]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_local_ret( +; GFX11-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX11-NEXT: ret float [[RET]] +; + %ret = atomicrmw fadd ptr addrspace(3) %ptr, float %value monotonic + ret float %ret +} + +define void @test_atomicrmw_fadd_f32_local_noret__amdgpu_ignore_denormal_mode(ptr addrspace(3) %ptr, float %value) { +; CI-LABEL: @test_atomicrmw_fadd_f32_local_noret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(3) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_local_noret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0:![0-9]+]] +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_local_noret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0:![0-9]+]] +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_local_noret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0:![0-9]+]] +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_local_noret__amdgpu_ignore_denormal_mode( +; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_local_noret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0:![0-9]+]] +; GFX11-NEXT: ret void +; + %unused = atomicrmw fadd ptr addrspace(3) %ptr, float %value monotonic, !amdgpu.ignore.denormal.mode !0 + ret void +} + +define float @test_atomicrmw_fadd_f32_local_ret__amdgpu_ignore_denormal_mode(ptr addrspace(3) %ptr, float %value) { +; CI-LABEL: @test_atomicrmw_fadd_f32_local_ret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(3) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret float [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_local_ret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX9-NEXT: ret float [[RET]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_local_ret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX908-NEXT: ret float [[RET]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_local_ret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret float [[RET]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_local_ret__amdgpu_ignore_denormal_mode( +; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret float [[RET]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_local_ret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX11-NEXT: ret float [[RET]] +; + %ret = atomicrmw fadd ptr addrspace(3) %ptr, float %value monotonic, !amdgpu.ignore.denormal.mode !0 + ret float %ret +} + +define void @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret(ptr addrspace(1) %ptr, double %value) #5 { +; CI-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret( +; CI-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret( +; GFX9-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret( +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret( +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret( +; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8 +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret( +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; + %unused = atomicrmw fadd ptr addrspace(1) %ptr, double %value monotonic + ret void +} + +define double @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret(ptr addrspace(1) %ptr, double %value) #5 { +; CI-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret( +; CI-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret double [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret( +; GFX9-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret( +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret( +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret double [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret( +; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8 +; GFX940-NEXT: ret double [[RET]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret( +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[TMP5]] +; + %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %value monotonic + ret double %ret +} + +define void @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, double %value) #5 { +; CI-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; + %unused = atomicrmw fadd ptr addrspace(1) %ptr, double %value monotonic, !amdgpu.ignore.denormal.mode !0 + ret void +} + +define double @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, double %value) #5 { +; CI-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret double [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret double [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret double [[RET]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[TMP5]] +; + %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %value monotonic, !amdgpu.ignore.denormal.mode !0 + ret double %ret +} + +define void @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret(ptr addrspace(3) %ptr, double %value) #5 { +; ALL-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret( +; ALL-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret void +; + %unused = atomicrmw fadd ptr addrspace(3) %ptr, double %value monotonic + ret void +} + +define double @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret(ptr addrspace(3) %ptr, double %value) #5 { +; ALL-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret( +; ALL-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret double [[TMP5]] +; + %ret = atomicrmw fadd ptr addrspace(3) %ptr, double %value monotonic + ret double %ret +} + +define void @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore_denormal_mode(ptr addrspace(3) %ptr, double %value) #5 { +; ALL-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore_denormal_mode( +; ALL-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret void +; + %unused = atomicrmw fadd ptr addrspace(3) %ptr, double %value monotonic, !amdgpu.ignore.denormal.mode !0 + ret void +} + +define double @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore_denormal_mode(ptr addrspace(3) %ptr, double %value) #5 { +; ALL-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore_denormal_mode( +; ALL-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret double [[TMP5]] +; + %ret = atomicrmw fadd ptr addrspace(3) %ptr, double %value monotonic, !amdgpu.ignore.denormal.mode !0 + ret double %ret +} + +define void @test_atomicrmw_fadd_f32_flat_system_noret__amdgpu_ignore_denormal_mode(ptr %ptr, float %value) { +; CI-LABEL: @test_atomicrmw_fadd_f32_flat_system_noret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_flat_system_noret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_flat_system_noret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_flat_system_noret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_flat_system_noret__amdgpu_ignore_denormal_mode( +; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_flat_system_noret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; + %unused = atomicrmw fadd ptr %ptr, float %value monotonic, !amdgpu.ignore.denormal.mode !0 + ret void +} + +define float @test_atomicrmw_fadd_f32_flat_system_ret__amdgpu_ignore_denormal_mode(ptr %ptr, float %value) { +; CI-LABEL: @test_atomicrmw_fadd_f32_flat_system_ret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret float [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_flat_system_ret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_flat_system_ret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_flat_system_ret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_flat_system_ret__amdgpu_ignore_denormal_mode( +; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret float [[RET]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_flat_system_ret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; + %ret = atomicrmw fadd ptr %ptr, float %value monotonic, !amdgpu.ignore.denormal.mode !0 + ret float %ret +} + +define void @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode(ptr %ptr, double %value) #5 { +; CI-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode( +; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; + %unused = atomicrmw fadd ptr %ptr, double %value monotonic, !amdgpu.ignore.denormal.mode !0 + ret void +} + +define double @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode(ptr %ptr, double %value) #5 { +; CI-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret double [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret double [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode( +; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret double [[RET]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[TMP5]] +; + %ret = atomicrmw fadd ptr %ptr, double %value monotonic, !amdgpu.ignore.denormal.mode !0 + ret double %ret +} + +define void @test_atomicrmw_fadd_f32_region_noret(ptr addrspace(2) %ptr, float %value) { +; ALL-LABEL: @test_atomicrmw_fadd_f32_region_noret( +; ALL-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(2) [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(2) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret void +; + %unused = atomicrmw fadd ptr addrspace(2) %ptr, float %value monotonic + ret void +} + +define float @test_atomicrmw_fadd_f32_region_ret(ptr addrspace(2) %ptr, float %value) { +; ALL-LABEL: @test_atomicrmw_fadd_f32_region_ret( +; ALL-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(2) [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(2) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret float [[TMP5]] +; + %ret = atomicrmw fadd ptr addrspace(2) %ptr, float %value monotonic + ret float %ret +} + +define void @test_atomicrmw_fadd_f64_region_noret(ptr addrspace(2) %ptr, double %value) { +; ALL-LABEL: @test_atomicrmw_fadd_f64_region_noret( +; ALL-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(2) [[PTR:%.*]], align 8 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(2) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret void +; + %unused = atomicrmw fadd ptr addrspace(2) %ptr, double %value monotonic + ret void +} + +define double @test_atomicrmw_fadd_f64_region_ret(ptr addrspace(2) %ptr, double %value) { +; ALL-LABEL: @test_atomicrmw_fadd_f64_region_ret( +; ALL-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(2) [[PTR:%.*]], align 8 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(2) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret double [[TMP5]] +; + %ret = atomicrmw fadd ptr addrspace(2) %ptr, double %value monotonic + ret double %ret +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_flat_agent(ptr %ptr, <2 x half> %value) { +; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst + ret <2 x half> %res +} + +define void @test_atomicrmw_fadd_v2f16_flat_agent_noret(ptr %ptr, <2 x half> %value) { +; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret void +; + %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst + ret void +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) { +; ALL-LABEL: @test_atomicrmw_fadd_v2f16_global_agent( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst + ret <2 x half> %res +} + +define void @test_atomicrmw_fadd_v2f16_flat_global_noret(ptr addrspace(1) %ptr, <2 x half> %value) { +; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_global_noret( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst + ret void +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_local_agent(ptr addrspace(3) %ptr, <2 x half> %value) { +; ALL-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %value syncscope("agent") seq_cst + ret <2 x half> %res +} + +define void @test_atomicrmw_fadd_v2f16_flat_local_noret(ptr addrspace(3) %ptr, <2 x half> %value) { +; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %value syncscope("agent") seq_cst + ret void +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent(ptr %ptr, <2 x bfloat> %value) { +; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret <2 x bfloat> %res +} + +define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret(ptr %ptr, <2 x bfloat> %value) { +; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret void +; + %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret void +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret <2 x bfloat> %res +} + +define void @test_atomicrmw_fadd_v2bf16_flat_global_noret(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_global_noret( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret void +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_local_agent(ptr addrspace(3) %ptr, <2 x bfloat> %value) { +; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret <2 x bfloat> %res +} + +define void @test_atomicrmw_fadd_v2bf16_flat_local_noret(ptr addrspace(3) %ptr, <2 x bfloat> %value) { +; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret void +} + attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #2 = { strictfp } +attributes #3 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #4 = { "denormal-fp-math-f32"="dynamic,dynamic" } +attributes #5 = { "denormal-fp-math"="dynamic,dynamic" } + +!0 = !{} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll new file mode 100644 index 0000000..01a2309 --- /dev/null +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll @@ -0,0 +1,859 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX803 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX906 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX908 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX90A %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX940 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX10 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX11 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX12 %s + +;--------------------------------------------------------------------- +; TODO: atomicrmw xchg +;--------------------------------------------------------------------- + +; ; xchg is supported over PCIe, so no expansion is necessary +; define <2 x bfloat> @test_atomicrmw_xchg_v2bf16_global_system(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst +; ret <2 x bfloat> %res +; } + +; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +; define <2 x bfloat> @test_atomicrmw_xchg_v2bf16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0 +; ret <2 x bfloat> %res +; } + +; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +; define <2 x bfloat> @test_atomicrmw_xchg_v2bf16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.remote.memory.access !0 +; ret <2 x bfloat> %res +; } + +; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +; define <2 x bfloat> @test_atomicrmw_xchg_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 +; ret <2 x bfloat> %res +; } + +;--------------------------------------------------------------------- +; atomicrmw fadd +;--------------------------------------------------------------------- + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2bf16_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2bf16_daz( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2bf16_dynamic(ptr addrspace(1) %ptr, <2 x bfloat> %value) #1 { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2bf16_dynamic( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x bfloat> %value) #1 { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +;--------------------------------------------------------------------- +; atomicrmw fsub +;--------------------------------------------------------------------- + +define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +;--------------------------------------------------------------------- +; atomicrmw fmax +;--------------------------------------------------------------------- + +define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +;--------------------------------------------------------------------- +; atomicrmw fmin +;--------------------------------------------------------------------- + +define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +attributes #0 = { "denormal-fp-mode"="preserve-sign,preserve-sign" } +attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" } + +!0 = !{} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX10: {{.*}} +; GFX11: {{.*}} +; GFX12: {{.*}} +; GFX803: {{.*}} +; GFX906: {{.*}} +; GFX908: {{.*}} +; GFX90A: {{.*}} +; GFX940: {{.*}} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll new file mode 100644 index 0000000..2a1824b --- /dev/null +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll @@ -0,0 +1,859 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX803 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX906 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX908 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX90A %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX940 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX10 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX11 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX12 %s + +;--------------------------------------------------------------------- +; TODO: atomicrmw xchg +;--------------------------------------------------------------------- + +; ; xchg is supported over PCIe, so no expansion is necessary +; define <2 x half> @test_atomicrmw_xchg_v2f16_global_system(ptr addrspace(1) %ptr, <2 x half> %value) { +; %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x half> %value seq_cst +; ret <2 x half> %res +; } + +; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +; define <2 x half> @test_atomicrmw_xchg_v2f16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { +; %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0 +; ret <2 x half> %res +; } + +; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +; define <2 x half> @test_atomicrmw_xchg_v2f16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory.access !0 +; ret <2 x half> %res +; } + +; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +; define <2 x half> @test_atomicrmw_xchg_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 +; ret <2 x half> %res +; } + +;--------------------------------------------------------------------- +; atomicrmw fadd +;--------------------------------------------------------------------- + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_system(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2f16_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2f16_daz( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2f16_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2f16_dynamic( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +;--------------------------------------------------------------------- +; atomicrmw fsub +;--------------------------------------------------------------------- + +define <2 x half> @test_atomicrmw_fsub_v2f16_global_system(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +;--------------------------------------------------------------------- +; atomicrmw fmax +;--------------------------------------------------------------------- + +define <2 x half> @test_atomicrmw_fmax_v2f16_global_system(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +;--------------------------------------------------------------------- +; atomicrmw fmin +;--------------------------------------------------------------------- + +define <2 x half> @test_atomicrmw_fmin_v2f16_global_system(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +attributes #0 = { "denormal-fp-mode"="preserve-sign,preserve-sign" } +attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" } + +!0 = !{} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX10: {{.*}} +; GFX11: {{.*}} +; GFX12: {{.*}} +; GFX803: {{.*}} +; GFX906: {{.*}} +; GFX908: {{.*}} +; GFX90A: {{.*}} +; GFX940: {{.*}} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll index e32c218..097d7b6 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll @@ -124,6 +124,16 @@ define i32 @test_atomicrmw_xor_0_global_system(ptr addrspace(1) %ptr) { } +define i32 @test_atomicrmw_or_0_global_system__metadata(ptr addrspace(1) %ptr) { +; CHECK-LABEL: define i32 @test_atomicrmw_or_0_global_system__metadata( +; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; CHECK-NEXT: ret i32 [[RES]] +; + %res = atomicrmw or ptr addrspace(1) %ptr, i32 0 seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + !0 = !{} ;. ; CHECK: [[META0]] = !{} diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll index bfbe489..9ea2db8 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll @@ -182,4 +182,14 @@ entry: ret void } +; CHECK-LABEL: @atomicrmw_add_global_to_flat_preserve_amdgpu_md( +; CHECK-NEXT: %ret = atomicrmw add ptr addrspace(1) %global.ptr, i32 %y seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 +define i32 @atomicrmw_add_global_to_flat_preserve_amdgpu_md(ptr addrspace(1) %global.ptr, i32 %y) #0 { + %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr + %ret = atomicrmw add ptr %cast, i32 %y seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i32 %ret +} + attributes #0 = { nounwind } + +!0 = !{} diff --git a/llvm/test/Transforms/Inline/AMDGPU/inline-atomicrmw-md-preserve.ll b/llvm/test/Transforms/Inline/AMDGPU/inline-atomicrmw-md-preserve.ll new file mode 100644 index 0000000..ec7edd2 --- /dev/null +++ b/llvm/test/Transforms/Inline/AMDGPU/inline-atomicrmw-md-preserve.ll @@ -0,0 +1,30 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=inline < %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes='cgscc(inline)' < %s | FileCheck %s + +; Ensure that custom metadata survives inlining + +define i32 @atomic_xor(ptr addrspace(1) %ptr, i32 %val) { +; CHECK-LABEL: define i32 @atomic_xor( +; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VAL:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VAL]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]], !amdgpu.no.remote.memory.access [[META0]] +; CHECK-NEXT: ret i32 [[RES]] +; + %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %val monotonic, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +define i32 @caller(ptr addrspace(1) %ptr, i32 %val) { +; CHECK-LABEL: define i32 @caller( +; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VAL:%.*]]) { +; CHECK-NEXT: [[RES_I:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VAL]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; CHECK-NEXT: ret i32 [[RES_I]] +; + %res = call i32 @atomic_xor(ptr addrspace(1) %ptr, i32 %val) + ret i32 %res +} + +!0 = !{} +;. +; CHECK: [[META0]] = !{} +;. -- cgit v1.1 From c69efcd54879835085cf03a09e1eec28dc80e1d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 19 Apr 2024 15:55:24 -0700 Subject: [flang][cuda] Relax assumed size check on object with device attribute (#89466) Assumed size arrays are apparently allowed with attribute device. --- flang/lib/Semantics/check-declarations.cpp | 5 ----- flang/test/Semantics/cuf03.cuf | 3 +-- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index adbd21d..6fcee96 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -948,11 +948,6 @@ void CheckHelper::CheckObjectEntity( "Component '%s' with ATTRIBUTES(DEVICE) must also be allocatable"_err_en_US, symbol.name()); } - if (IsAssumedSizeArray(symbol)) { - messages_.Say( - "Object '%s' with ATTRIBUTES(DEVICE) may not be assumed size"_err_en_US, - symbol.name()); - } break; case common::CUDADataAttr::Managed: if (!IsAutomatic(symbol) && !IsAllocatable(symbol) && diff --git a/flang/test/Semantics/cuf03.cuf b/flang/test/Semantics/cuf03.cuf index 020a172..472d53d 100644 --- a/flang/test/Semantics/cuf03.cuf +++ b/flang/test/Semantics/cuf03.cuf @@ -57,8 +57,7 @@ module m contains attributes(device) subroutine devsubr(n,da) integer, intent(in) :: n - !ERROR: Object 'da' with ATTRIBUTES(DEVICE) may not be assumed size - real, device :: da(*) + real, device :: da(*) ! ok real, managed :: ma(n) ! ok !WARNING: Pointer 'dp' may not be associated in a device subprogram real, device, pointer :: dp -- cgit v1.1 From f09f99ed329f58c79fba43abf5fc73a28a0e2055 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 19 Apr 2024 16:06:25 -0700 Subject: [RISCV] Add RISCVTuneProcessorModel to 'generic' CPU. NFC Remove hardcode GENERIC cpu from RISCVTargetDefEmitter.cpp. --- llvm/lib/Target/RISCV/RISCVProcessors.td | 2 +- llvm/utils/TableGen/RISCVTargetDefEmitter.cpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index f9a557e..3c86036 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -66,7 +66,7 @@ def GENERIC_RV64 : RISCVProcessorModel<"generic-rv64", GenericTuneInfo; // Support generic for compatibility with other targets. The triple will be used // to change to the appropriate rv32/rv64 version. -def : ProcessorModel<"generic", NoSchedModel, []>, GenericTuneInfo; +def GENERIC : RISCVTuneProcessorModel<"generic", NoSchedModel>, GenericTuneInfo; def ROCKET_RV32 : RISCVProcessorModel<"rocket-rv32", RocketModel, diff --git a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp index e57bc6f..62916bd 100644 --- a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp +++ b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp @@ -82,7 +82,6 @@ static void EmitRISCVTargetDef(RecordKeeper &RK, raw_ostream &OS) { OS << "#ifndef TUNE_PROC\n" << "#define TUNE_PROC(ENUM, NAME)\n" << "#endif\n\n"; - OS << "TUNE_PROC(GENERIC, \"generic\")\n"; for (const Record *Rec : RK.getAllDerivedDefinitions("RISCVTuneProcessorModel")) { -- cgit v1.1 From 11019be4cf75ff40d2da84094477ab5dac818c99 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 19 Apr 2024 16:34:43 -0700 Subject: [test][sancov] Regenerate with update_test_checks (#89457) Prepare for #89458 --- .../SanitizerCoverage/inline-bool-flag.ll | 29 ++++--- .../SanitizerCoverage/stack-depth.ll | 92 +++++++++++++++++----- 2 files changed, 90 insertions(+), 31 deletions(-) diff --git a/llvm/test/Instrumentation/SanitizerCoverage/inline-bool-flag.ll b/llvm/test/Instrumentation/SanitizerCoverage/inline-bool-flag.ll index be8b6c8..1380e6a 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/inline-bool-flag.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/inline-bool-flag.ll @@ -1,23 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4 ; Test -sanitizer-coverage-inline-bool-flag=1 ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-inline-bool-flag=1 -S | FileCheck %s -; CHECK: $foo = comdat nodeduplicate -; CHECK: @__sancov_gen_ = private global [1 x i1] zeroinitializer, section "__sancov_bools", comdat($foo), align 1{{$}} -; CHECK: @__start___sancov_bools = extern_weak hidden global i1 -; CHECK-NEXT: @__stop___sancov_bools = extern_weak hidden global i1 -; CHECK: @llvm.used = appending global [1 x ptr] [ptr @sancov.module_ctor_bool_flag], section "llvm.metadata" -; CHECK: @llvm.compiler.used = appending global [1 x ptr] [ptr @__sancov_gen_], section "llvm.metadata" target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-unknown-linux-gnu" +;. +; CHECK: @__sancov_lowest_stack = external thread_local(initialexec) global i64 +; CHECK: @__sancov_gen_ = private global [1 x i1] zeroinitializer, section "__sancov_bools", comdat($foo), align 1 +; CHECK: @__start___sancov_bools = extern_weak hidden global i1 +; CHECK: @__stop___sancov_bools = extern_weak hidden global i1 +; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 2, ptr @sancov.module_ctor_bool_flag, ptr @sancov.module_ctor_bool_flag }] +; CHECK: @llvm.used = appending global [1 x ptr] [ptr @sancov.module_ctor_bool_flag], section "llvm.metadata" +; CHECK: @llvm.compiler.used = appending global [1 x ptr] [ptr @__sancov_gen_], section "llvm.metadata" +;. define void @foo() { -; CHECK-LABEL: @foo( +; CHECK-LABEL: define void @foo() comdat { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i1, ptr @__sancov_gen_, align 1, !nosanitize ![[#EMPTY:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i1, ptr @__sancov_gen_, align 1, !nosanitize [[META0:![0-9]+]] ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i1 [[TMP0]], false ; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP3:%.*]] ; CHECK: 2: -; CHECK-NEXT: store i1 true, ptr @__sancov_gen_, align 1, !nosanitize ![[#EMPTY:]] +; CHECK-NEXT: store i1 true, ptr @__sancov_gen_, align 1, !nosanitize [[META0]] ; CHECK-NEXT: br label [[TMP3]] ; CHECK: 3: ; CHECK-NEXT: ret void @@ -25,6 +29,9 @@ define void @foo() { entry: ret void } -; CHECK: call void @__sanitizer_cov_bool_flag_init(ptr @__start___sancov_bools, ptr @__stop___sancov_bools) -; CHECK: ![[#EMPTY]] = !{} +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind } +;. +; CHECK: [[META0]] = !{} +;. diff --git a/llvm/test/Instrumentation/SanitizerCoverage/stack-depth.ll b/llvm/test/Instrumentation/SanitizerCoverage/stack-depth.ll index 0e5eaae9..908bca6 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/stack-depth.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/stack-depth.ll @@ -1,43 +1,95 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4 ; This check verifies that stack depth instrumentation works correctly. -; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 \ -; RUN: -sanitizer-coverage-stack-depth -S | FileCheck %s -; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=3 \ -; RUN: -sanitizer-coverage-stack-depth -sanitizer-coverage-trace-pc-guard \ -; RUN: -S | FileCheck %s +; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-stack-depth -S | FileCheck %s --check-prefixes=L1 +; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=3 -sanitizer-coverage-stack-depth -S -sanitizer-coverage-trace-pc-guard | FileCheck %s --check-prefixes=L3 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -; CHECK: @__sancov_lowest_stack = thread_local(initialexec) global i64 -1 @__sancov_lowest_stack = thread_local global i64 0, align 8 +;. +; L1: @__sancov_lowest_stack = thread_local(initialexec) global i64 -1, align 8 +;. +; L3: @__sancov_lowest_stack = thread_local(initialexec) global i64 -1, align 8 +; L3: @__sancov_gen_ = private global [1 x i32] zeroinitializer, section "__sancov_guards", comdat($foo), align 4 +; L3: @__sancov_gen_.1 = private global [1 x i32] zeroinitializer, section "__sancov_guards", comdat($bar), align 4 +; L3: @__sancov_gen_.2 = private global [1 x i32] zeroinitializer, section "__sancov_guards", comdat($_ZTW21__sancov_lowest_stack), align 4 +; L3: @__start___sancov_guards = extern_weak hidden global i32 +; L3: @__stop___sancov_guards = extern_weak hidden global i32 +; L3: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 2, ptr @sancov.module_ctor_trace_pc_guard, ptr @sancov.module_ctor_trace_pc_guard }] +; L3: @llvm.used = appending global [1 x ptr] [ptr @sancov.module_ctor_trace_pc_guard], section "llvm.metadata" +; L3: @llvm.compiler.used = appending global [3 x ptr] [ptr @__sancov_gen_, ptr @__sancov_gen_.1, ptr @__sancov_gen_.2], section "llvm.metadata" +;. define i32 @foo() { +; L1-LABEL: define i32 @foo() { +; L1-NEXT: entry: +; L1-NEXT: ret i32 7 +; +; L3-LABEL: define i32 @foo() comdat { +; L3-NEXT: entry: +; L3-NEXT: call void @__sanitizer_cov_trace_pc_guard(ptr @__sancov_gen_) #[[ATTR2:[0-9]+]] +; L3-NEXT: ret i32 7 +; entry: -; CHECK-LABEL: define i32 @foo -; CHECK-NOT: call ptr @llvm.frameaddress.p0(i32 0) -; CHECK-NOT: @__sancov_lowest_stack -; CHECK: ret i32 7 ret i32 7 } define i32 @bar() { +; L1-LABEL: define i32 @bar() { +; L1-NEXT: entry: +; L1-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) +; L1-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +; L1-NEXT: [[TMP2:%.*]] = load i64, ptr @__sancov_lowest_stack, align 8, !nosanitize [[META0:![0-9]+]] +; L1-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP1]], [[TMP2]] +; L1-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP5:%.*]] +; L1: 4: +; L1-NEXT: store i64 [[TMP1]], ptr @__sancov_lowest_stack, align 8, !nosanitize [[META0]] +; L1-NEXT: br label [[TMP5]] +; L1: 5: +; L1-NEXT: [[CALL:%.*]] = call i32 @foo() +; L1-NEXT: ret i32 [[CALL]] +; +; L3-LABEL: define i32 @bar() comdat { +; L3-NEXT: entry: +; L3-NEXT: call void @__sanitizer_cov_trace_pc_guard(ptr @__sancov_gen_.1) #[[ATTR2]] +; L3-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) +; L3-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +; L3-NEXT: [[TMP2:%.*]] = load i64, ptr @__sancov_lowest_stack, align 8, !nosanitize [[META0:![0-9]+]] +; L3-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP1]], [[TMP2]] +; L3-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP5:%.*]] +; L3: 4: +; L3-NEXT: store i64 [[TMP1]], ptr @__sancov_lowest_stack, align 8, !nosanitize [[META0]] +; L3-NEXT: br label [[TMP5]] +; L3: 5: +; L3-NEXT: [[CALL:%.*]] = call i32 @foo() +; L3-NEXT: ret i32 [[CALL]] +; entry: -; CHECK-LABEL: define i32 @bar -; CHECK: [[framePtr:%[^ \t]+]] = call ptr @llvm.frameaddress.p0(i32 0) -; CHECK: [[frameInt:%[^ \t]+]] = ptrtoint ptr [[framePtr]] to [[intType:i[0-9]+]] -; CHECK: [[lowest:%[^ \t]+]] = load [[intType]], ptr @__sancov_lowest_stack -; CHECK: [[cmp:%[^ \t]+]] = icmp ult [[intType]] [[frameInt]], [[lowest]] -; CHECK: br i1 [[cmp]], label %[[ifLabel:[^ \t]+]], label -; CHECK: [[ifLabel]]: -; CHECK: store [[intType]] [[frameInt]], ptr @__sancov_lowest_stack -; CHECK: %call = call i32 @foo() -; CHECK: ret i32 %call %call = call i32 @foo() ret i32 %call } define weak_odr hidden ptr @_ZTW21__sancov_lowest_stack() { +; L1-LABEL: define weak_odr hidden ptr @_ZTW21__sancov_lowest_stack() { +; L1-NEXT: ret ptr @__sancov_lowest_stack +; +; L3-LABEL: define weak_odr hidden ptr @_ZTW21__sancov_lowest_stack() comdat { +; L3-NEXT: call void @__sanitizer_cov_trace_pc_guard(ptr @__sancov_gen_.2) #[[ATTR2]] +; L3-NEXT: ret ptr @__sancov_lowest_stack +; ret ptr @__sancov_lowest_stack } +;. +; L1: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +;. +; L3: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; L3: attributes #[[ATTR1:[0-9]+]] = { nounwind } +; L3: attributes #[[ATTR2]] = { nomerge } +;. +; L1: [[META0]] = !{} +;. +; L3: [[META0]] = !{} +;. -- cgit v1.1 From c8627e4e0c8ac453844638522b887ac7b7687672 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Fri, 19 Apr 2024 16:36:15 -0700 Subject: Revert "[lldb] Provide a better error message for missing symbols (#89433)" This reverts commit 08163cd9d82690e808c28515523b5fd0923d7b38. I accidentally broke the test while addressing review feedback. --- lldb/source/Expression/IRExecutionUnit.cpp | 4 +--- lldb/test/API/lang/cpp/constructors/TestCppConstructors.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/lldb/source/Expression/IRExecutionUnit.cpp b/lldb/source/Expression/IRExecutionUnit.cpp index 07df8c5..cb9bee8 100644 --- a/lldb/source/Expression/IRExecutionUnit.cpp +++ b/lldb/source/Expression/IRExecutionUnit.cpp @@ -431,9 +431,7 @@ void IRExecutionUnit::GetRunnableInfo(Status &error, lldb::addr_t &func_addr, } m_failed_lookups.clear(); - ss.PutCString( - "\nHint: The expression tried to call a function that is not present " - "in the target, perhaps because it was optimized out by the compiler."); + error.SetErrorString(ss.GetString()); return; diff --git a/lldb/test/API/lang/cpp/constructors/TestCppConstructors.py b/lldb/test/API/lang/cpp/constructors/TestCppConstructors.py index 140877a..6724bfc 100644 --- a/lldb/test/API/lang/cpp/constructors/TestCppConstructors.py +++ b/lldb/test/API/lang/cpp/constructors/TestCppConstructors.py @@ -47,7 +47,7 @@ class TestCase(TestBase): self.expect( "expr ClassWithDeletedDefaultCtor().value", error=True, - substrs=["Couldn't look up symbols:", "function missing"], + substrs=["Couldn't look up symbols:"], ) @skipIfWindows # Can't find operator new. -- cgit v1.1 From 6a35ee8077647b32626c3c41f9d9da4dae6670fc Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Fri, 19 Apr 2024 22:14:55 +0200 Subject: [lldb] Provide a better error message for missing symbols (#89433) This adds a hint to the missing symbols error message to make it easier to understand what this means to users. [Reapplies an earlier patch with a test fix.] --- lldb/source/Expression/IRExecutionUnit.cpp | 4 +++- lldb/test/API/lang/cpp/constructors/TestCppConstructors.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/lldb/source/Expression/IRExecutionUnit.cpp b/lldb/source/Expression/IRExecutionUnit.cpp index cb9bee8..07df8c5 100644 --- a/lldb/source/Expression/IRExecutionUnit.cpp +++ b/lldb/source/Expression/IRExecutionUnit.cpp @@ -431,7 +431,9 @@ void IRExecutionUnit::GetRunnableInfo(Status &error, lldb::addr_t &func_addr, } m_failed_lookups.clear(); - + ss.PutCString( + "\nHint: The expression tried to call a function that is not present " + "in the target, perhaps because it was optimized out by the compiler."); error.SetErrorString(ss.GetString()); return; diff --git a/lldb/test/API/lang/cpp/constructors/TestCppConstructors.py b/lldb/test/API/lang/cpp/constructors/TestCppConstructors.py index 6724bfc..baf06e4 100644 --- a/lldb/test/API/lang/cpp/constructors/TestCppConstructors.py +++ b/lldb/test/API/lang/cpp/constructors/TestCppConstructors.py @@ -47,7 +47,7 @@ class TestCase(TestBase): self.expect( "expr ClassWithDeletedDefaultCtor().value", error=True, - substrs=["Couldn't look up symbols:"], + substrs=["Couldn't look up symbols:", "function", "optimized out"], ) @skipIfWindows # Can't find operator new. -- cgit v1.1 From 7fcca112034f55c26e943c13fd0d57adfbe96705 Mon Sep 17 00:00:00 2001 From: Congcong Cai Date: Sat, 20 Apr 2024 07:45:20 +0800 Subject: [clang-tidy] bugprone-lambda-function-name ignore macro in captures (#89076) --- .../clang-tidy/bugprone/LambdaFunctionNameCheck.cpp | 14 +++++++++++--- clang-tools-extra/docs/ReleaseNotes.rst | 4 ++++ .../checkers/bugprone/lambda-function-name.cpp | 19 +++++++++++++++++++ 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.cpp index 5260a8b..32f5edd 100644 --- a/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.cpp @@ -8,7 +8,9 @@ #include "LambdaFunctionNameCheck.h" #include "clang/AST/ASTContext.h" +#include "clang/AST/DeclCXX.h" #include "clang/ASTMatchers/ASTMatchFinder.h" +#include "clang/ASTMatchers/ASTMatchers.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Lex/MacroInfo.h" #include "clang/Lex/Preprocessor.h" @@ -56,6 +58,8 @@ private: LambdaFunctionNameCheck::SourceRangeSet* SuppressMacroExpansions; }; +AST_MATCHER(CXXMethodDecl, isInLambda) { return Node.getParent()->isLambda(); } + } // namespace LambdaFunctionNameCheck::LambdaFunctionNameCheck(StringRef Name, @@ -69,9 +73,13 @@ void LambdaFunctionNameCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { } void LambdaFunctionNameCheck::registerMatchers(MatchFinder *Finder) { - // Match on PredefinedExprs inside a lambda. - Finder->addMatcher(predefinedExpr(hasAncestor(lambdaExpr())).bind("E"), - this); + Finder->addMatcher( + cxxMethodDecl(isInLambda(), + hasBody(forEachDescendant( + predefinedExpr(hasAncestor(cxxMethodDecl().bind("fn"))) + .bind("E"))), + equalsBoundNode("fn")), + this); } void LambdaFunctionNameCheck::registerPPCallbacks( diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index a457e6f..9ef1d38 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -155,6 +155,10 @@ Changes in existing checks ` check to ignore code within unevaluated contexts, such as ``decltype``. +- Improved :doc:`bugprone-lambda-function-name` + check by ignoring ``__func__`` macro in lambda captures, initializers of + default parameters and nested function declarations. + - Improved :doc:`bugprone-non-zero-enum-to-bool-conversion ` check by eliminating false positives resulting from direct usage of bitwise operators diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/lambda-function-name.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/lambda-function-name.cpp index 936ee87..5c2bb57 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/lambda-function-name.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/lambda-function-name.cpp @@ -19,6 +19,22 @@ void Positives() { // CHECK-MESSAGES-NO-CONFIG: :[[@LINE-1]]:8: warning: inside a lambda, '__FUNCTION__' expands to the name of the function call operator; consider capturing the name of the enclosing function explicitly [bugprone-lambda-function-name] [] { EMBED_IN_ANOTHER_MACRO1; }(); // CHECK-MESSAGES-NO-CONFIG: :[[@LINE-1]]:8: warning: inside a lambda, '__func__' expands to the name of the function call operator; consider capturing the name of the enclosing function explicitly [bugprone-lambda-function-name] + [] { + __func__; + // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: inside a lambda, '__func__' expands to the name of the function call operator; consider capturing the name of the enclosing function explicitly [bugprone-lambda-function-name] + struct S { + void f() { + __func__; + [] { + __func__; + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: inside a lambda, '__func__' expands to the name of the function call operator; consider capturing the name of the enclosing function explicitly [bugprone-lambda-function-name] + }(); + __func__; + } + }; + __func__; + // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: inside a lambda, '__func__' expands to the name of the function call operator; consider capturing the name of the enclosing function explicitly [bugprone-lambda-function-name] + }(); } #define FUNC_MACRO_WITH_FILE_AND_LINE Foo(__func__, __FILE__, __LINE__) @@ -40,4 +56,7 @@ void Negatives() { [] { FUNC_MACRO_WITH_FILE_AND_LINE; }(); [] { FUNCTION_MACRO_WITH_FILE_AND_LINE; }(); [] { EMBED_IN_ANOTHER_MACRO2; }(); + + [] (const char* func = __func__) { func; }(); + [func=__func__] { func; }(); } -- cgit v1.1 From 777d2e54a9b69463a30c305a0d1a34fef90dbe93 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 19 Apr 2024 17:00:57 -0700 Subject: [memprof] Drop the trait parameter (NFC) (#89461) OnDiskIterableChainedHashTable::Create can default-contruct a trait object for us. We don't need to construct one on our own unless we need to customize something (like a version number). --- llvm/lib/ProfileData/InstrProfReader.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index b05bad7..cefb6af 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -1272,13 +1272,13 @@ Error IndexedMemProfReader::deserialize(const unsigned char *Start, MemProfFrameTable.reset(MemProfFrameHashTable::Create( /*Buckets=*/Start + FrameTableOffset, /*Payload=*/Start + FramePayloadOffset, - /*Base=*/Start, memprof::FrameLookupTrait())); + /*Base=*/Start)); if (Version >= memprof::Version2) MemProfCallStackTable.reset(MemProfCallStackHashTable::Create( /*Buckets=*/Start + CallStackTableOffset, /*Payload=*/Start + CallStackPayloadOffset, - /*Base=*/Start, memprof::CallStackLookupTrait())); + /*Base=*/Start)); #ifdef EXPENSIVE_CHECKS // Go through all the records and verify that CSId has been correctly -- cgit v1.1 From f3587d41064ce22330231baee1553b210777f3e3 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 19 Apr 2024 17:01:18 -0700 Subject: [sancov] Apply branch weights when checking counters (#89458) It reduces instrumentation overhead by ~50%. --- llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp | 10 +++++++--- .../test/Instrumentation/SanitizerCoverage/inline-bool-flag.ll | 3 ++- llvm/test/Instrumentation/SanitizerCoverage/stack-depth.ll | 6 ++++-- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index 17c1c44..239e7c6 100644 --- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/Support/CommandLine.h" @@ -979,8 +980,9 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, FunctionBoolArray->getValueType(), FunctionBoolArray, {ConstantInt::get(IntptrTy, 0), ConstantInt::get(IntptrTy, Idx)}); auto Load = IRB.CreateLoad(Int1Ty, FlagPtr); - auto ThenTerm = - SplitBlockAndInsertIfThen(IRB.CreateIsNull(Load), &*IP, false); + auto ThenTerm = SplitBlockAndInsertIfThen( + IRB.CreateIsNull(Load), &*IP, false, + MDBuilder(IRB.getContext()).createBranchWeights(1, (1 << 20) - 1)); IRBuilder<> ThenIRB(ThenTerm); auto Store = ThenIRB.CreateStore(ConstantInt::getTrue(Int1Ty), FlagPtr); Load->setNoSanitizeMetadata(); @@ -997,7 +999,9 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, auto FrameAddrInt = IRB.CreatePtrToInt(FrameAddrPtr, IntptrTy); auto LowestStack = IRB.CreateLoad(IntptrTy, SanCovLowestStack); auto IsStackLower = IRB.CreateICmpULT(FrameAddrInt, LowestStack); - auto ThenTerm = SplitBlockAndInsertIfThen(IsStackLower, &*IP, false); + auto ThenTerm = SplitBlockAndInsertIfThen( + IsStackLower, &*IP, false, + MDBuilder(IRB.getContext()).createBranchWeights(1, (1 << 20) - 1)); IRBuilder<> ThenIRB(ThenTerm); auto Store = ThenIRB.CreateStore(FrameAddrInt, SanCovLowestStack); LowestStack->setNoSanitizeMetadata(); diff --git a/llvm/test/Instrumentation/SanitizerCoverage/inline-bool-flag.ll b/llvm/test/Instrumentation/SanitizerCoverage/inline-bool-flag.ll index 1380e6a..a6d7ae6 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/inline-bool-flag.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/inline-bool-flag.ll @@ -19,7 +19,7 @@ define void @foo() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i1, ptr @__sancov_gen_, align 1, !nosanitize [[META0:![0-9]+]] ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i1 [[TMP0]], false -; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP3:%.*]] +; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1:![0-9]+]] ; CHECK: 2: ; CHECK-NEXT: store i1 true, ptr @__sancov_gen_, align 1, !nosanitize [[META0]] ; CHECK-NEXT: br label [[TMP3]] @@ -34,4 +34,5 @@ entry: ; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind } ;. ; CHECK: [[META0]] = !{} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} ;. diff --git a/llvm/test/Instrumentation/SanitizerCoverage/stack-depth.ll b/llvm/test/Instrumentation/SanitizerCoverage/stack-depth.ll index 908bca6..00547af 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/stack-depth.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/stack-depth.ll @@ -43,7 +43,7 @@ define i32 @bar() { ; L1-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 ; L1-NEXT: [[TMP2:%.*]] = load i64, ptr @__sancov_lowest_stack, align 8, !nosanitize [[META0:![0-9]+]] ; L1-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP1]], [[TMP2]] -; L1-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP5:%.*]] +; L1-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1:![0-9]+]] ; L1: 4: ; L1-NEXT: store i64 [[TMP1]], ptr @__sancov_lowest_stack, align 8, !nosanitize [[META0]] ; L1-NEXT: br label [[TMP5]] @@ -58,7 +58,7 @@ define i32 @bar() { ; L3-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 ; L3-NEXT: [[TMP2:%.*]] = load i64, ptr @__sancov_lowest_stack, align 8, !nosanitize [[META0:![0-9]+]] ; L3-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP1]], [[TMP2]] -; L3-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP5:%.*]] +; L3-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1:![0-9]+]] ; L3: 4: ; L3-NEXT: store i64 [[TMP1]], ptr @__sancov_lowest_stack, align 8, !nosanitize [[META0]] ; L3-NEXT: br label [[TMP5]] @@ -90,6 +90,8 @@ define weak_odr hidden ptr @_ZTW21__sancov_lowest_stack() { ; L3: attributes #[[ATTR2]] = { nomerge } ;. ; L1: [[META0]] = !{} +; L1: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} ;. ; L3: [[META0]] = !{} +; L3: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} ;. -- cgit v1.1