diff options
| author | anjenner <161845516+anjenner@users.noreply.github.com> | 2025-12-09 23:13:33 +0000 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-12-09 23:13:33 +0000 |
| commit | 27651133e213a6a1eb4d0e47837625cee3613111 (patch) | |
| tree | dfc27f4c48aefacb3fc7e2a709f8fc92a19bd7e7 /llvm/test/CodeGen | |
| parent | 0a2e56df64c936bacc746aeb94878d66ee00dec3 (diff) | |
| download | llvm-main.zip llvm-main.tar.gz llvm-main.tar.bz2 | |
These both perform conditional subtraction, returning the minuend and
zero respectively, if the difference is negative.
Diffstat (limited to 'llvm/test/CodeGen')
3 files changed, 140 insertions, 585 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll deleted file mode 100644 index bff4771..0000000 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll +++ /dev/null @@ -1,270 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefix=GFX10 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s -check-prefix=GFX10 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GFX11 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12 - -define i32 @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) { -; GFX10-LABEL: global_atomic_csub: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_atomic_csub: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: global_atomic_csub: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data seq_cst, !amdgpu.no.remote.memory !0 - ret i32 %ret -} - -define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) { -; GFX10-LABEL: global_atomic_csub_offset: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_atomic_csub_offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: global_atomic_csub_offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024 - %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst, !amdgpu.no.remote.memory !0 - ret i32 %ret -} - -define void @global_atomic_csub_nortn(ptr addrspace(1) %ptr, i32 %data) { -; GFX10-LABEL: global_atomic_csub_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_atomic_csub_nortn: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: global_atomic_csub_nortn: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data seq_cst, !amdgpu.no.remote.memory !0 - ret void -} - -define void @global_atomic_csub_offset_nortn(ptr addrspace(1) %ptr, i32 %data) { -; GFX10-LABEL: global_atomic_csub_offset_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_atomic_csub_offset_nortn: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: global_atomic_csub_offset_nortn: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024 - %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst, !amdgpu.no.remote.memory !0 - ret void -} - -define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) %ptr, i32 %data) { -; GFX10-LABEL: global_atomic_csub_sgpr_base_offset: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: global_store_dword v[0:1], v0, off -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: global_atomic_csub_sgpr_base_offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2 -; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: global_store_b32 v[0:1], v0, off -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: global_atomic_csub_sgpr_base_offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: global_store_b32 v[0:1], v0, off -; GFX12-NEXT: s_endpgm - %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024 - %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst, !amdgpu.no.remote.memory !0 - store i32 %ret, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspace(1) %ptr, i32 %data) { -; GFX10-LABEL: global_atomic_csub_sgpr_base_offset_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: global_atomic_csub_sgpr_base_offset_nortn: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2 -; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: global_atomic_csub_sgpr_base_offset_nortn: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_endpgm - %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024 - %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst, !amdgpu.no.remote.memory !0 - ret void -} - -attributes #0 = { nounwind willreturn } -attributes #1 = { argmemonly nounwind } - -!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll index 4af2d58..d281492 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll @@ -2,190 +2,222 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISEL %s -declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3), i32) -declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1), i32) -declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr, i32) - -define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) { -; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32: +define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32(ptr %addr, i32 %in) { +; GFX12-SDAG-LABEL: flat_atomic_usub_cond_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], -16 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32: +; GFX12-GISEL-LABEL: flat_atomic_usub_cond_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i32, ptr %addr, i32 -4 - %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in) + %gep = getelementptr i32, ptr %addr, i32 -4 + %unused = atomicrmw usub_cond ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } -define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { -; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced: +define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { +; GFX12-SDAG-LABEL: flat_atomic_usub_cond_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], -16 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced: +; GFX12-GISEL-LABEL: flat_atomic_usub_cond_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i32, ptr %addr, i32 -4 - %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in) + %gep = getelementptr i32, ptr %addr, i32 -4 + %unused = atomicrmw usub_cond ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } -define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr %use) { -; GFX12-SDAG-LABEL: flat_atomic_cond_sub_rtn_u32: +define amdgpu_kernel void @flat_atomic_usub_cond_rtn_u32(ptr %addr, i32 %in, ptr %use) { +; GFX12-SDAG-LABEL: flat_atomic_usub_cond_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-SDAG-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: flat_atomic_cond_sub_rtn_u32: +; GFX12-GISEL-LABEL: flat_atomic_usub_cond_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 16 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-GISEL-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-GISEL-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i32, ptr %addr, i32 4 - %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in) + %gep = getelementptr i32, ptr %addr, i32 4 + %val = atomicrmw usub_cond ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 store i32 %val, ptr %use ret void } -define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) { -; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32: +define amdgpu_kernel void @global_atomic_usub_cond_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) { +; GFX12-SDAG-LABEL: global_atomic_usub_cond_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], -16 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32: +; GFX12-GISEL-LABEL: global_atomic_usub_cond_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 -; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4 - %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in) + %unused = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } -define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { -; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32_forced: +define amdgpu_kernel void @global_atomic_usub_cond_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { +; GFX12-SDAG-LABEL: global_atomic_usub_cond_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], -16 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32_forced: +; GFX12-GISEL-LABEL: global_atomic_usub_cond_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 -; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4 - %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in) + %unused = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } -define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) { -; GFX12-SDAG-LABEL: global_atomic_cond_sub_rtn_u32: +define amdgpu_kernel void @global_atomic_usub_cond_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) { +; GFX12-SDAG-LABEL: global_atomic_usub_cond_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: global_atomic_cond_sub_rtn_u32: +; GFX12-GISEL-LABEL: global_atomic_usub_cond_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 16 -; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %addr, i32 4 - %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in) + %val = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 store i32 %val, ptr addrspace(1) %use ret void } -define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) { -; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32: +define amdgpu_kernel void @ds_usub_cond_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) { +; GFX12-SDAG-LABEL: ds_usub_cond_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -193,9 +225,11 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %i ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 ; GFX12-SDAG-NEXT: ds_cond_sub_u32 v0, v1 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32: +; GFX12-GISEL-LABEL: ds_usub_cond_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -203,15 +237,17 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %i ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 ; GFX12-GISEL-NEXT: ds_cond_sub_u32 v0, v1 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(3) %addr, i32 -4 - %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in) + %unused = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } -define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { -; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32_forced: +define amdgpu_kernel void @ds_usub_cond_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { +; GFX12-SDAG-LABEL: ds_usub_cond_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -219,9 +255,11 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 ; GFX12-SDAG-NEXT: ds_cond_sub_u32 v0, v1 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32_forced: +; GFX12-GISEL-LABEL: ds_usub_cond_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -229,38 +267,44 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 ; GFX12-GISEL-NEXT: ds_cond_sub_u32 v0, v1 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(3) %addr, i32 -4 - %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in) + %unused = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } -define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) { -; GFX12-SDAG-LABEL: ds_cond_sub_rtn_u32: +define amdgpu_kernel void @ds_usub_cond_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) { +; GFX12-SDAG-LABEL: ds_usub_cond_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1 offset:16 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: ds_store_b32 v1, v0 ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: ds_cond_sub_rtn_u32: +; GFX12-GISEL-LABEL: ds_usub_cond_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v1, v0 offset:16 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: ds_store_b32 v1, v0 ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(3) %addr, i32 4 - %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in) + %val = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 store i32 %val, ptr addrspace(3) %use ret void } + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll deleted file mode 100644 index 243cd59..0000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll +++ /dev/null @@ -1,219 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12 - -define float @raw_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -; GFX12-LABEL: raw_buffer_atomic_cond_sub_return: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -define void @raw_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -; GFX12-LABEL: raw_buffer_atomic_cond_sub_no_return: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) - ret void -} - -define void @raw_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 { -; GFX12-LABEL: raw_buffer_atomic_cond_sub_no_return_forced: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) - ret void -} - -define float @raw_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_return: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_mov_b32 s4, 4 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -define void @raw_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_no_return: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_mov_b32 s4, 4 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) - ret void -} - -define void @raw_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 { -; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_no_return_forced: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_mov_b32 s4, 4 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) - ret void -} - -define float @struct_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -; GFX12-LABEL: struct_buffer_atomic_cond_sub_return: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s16 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], null idxen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -define void @struct_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -; GFX12-LABEL: struct_buffer_atomic_cond_sub_no_return: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) - ret void -} - -define void @struct_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 { -; GFX12-LABEL: struct_buffer_atomic_cond_sub_no_return_forced: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) - ret void -} - -define float @struct_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_return: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s16 -; GFX12-NEXT: s_mov_b32 s4, 4 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -define void @struct_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_no_return: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16 -; GFX12-NEXT: s_mov_b32 s4, 4 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) - ret void -} - -define void @struct_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 { -; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_no_return_forced: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16 -; GFX12-NEXT: s_mov_b32 s4, 4 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) - ret void -} - -declare i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 - -attributes #0 = { nounwind } -attributes #1 = { nounwind "target-features"="+atomic-csub-no-rtn-insts" } - |
