Diffstat (limited to 'llvm/test')
-rw-r--r--  llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll                 57
-rw-r--r--  llvm/test/Bitcode/amdgcn-atomic.ll                                     147
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll  270
-rw-r--r--  llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll                           236
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll                219
5 files changed, 287 insertions, 642 deletions
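
In brief (an editorial sketch, not part of the patch): the tests migrate from the raw AMDGPU subtract intrinsics to the generic atomicrmw operations. The llvm.amdgcn.atomic.csub and llvm.amdgcn.global.atomic.csub intrinsics map to usub_sat, and llvm.amdgcn.atomic.cond.sub maps to usub_cond. In LLVM IR, with placeholder names %ptr and %val:

    ; old intrinsic form, removed from the tests below
    %old = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %val)
    ; generic form the updated tests check for
    %new = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %val seq_cst, !amdgpu.no.remote.memory !0

where !0 = !{} is empty module-level metadata, as in the CodeGen tests below.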
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll
index 15355ea..d9e51c3 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll
@@ -15,62 +15,5 @@ define amdgpu_kernel void @test2(ptr %ptr, i32 %cmp, i32 %new) {
ret void
}
-; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %val)
-define amdgpu_kernel void @test_atomic_csub_i32(ptr addrspace(1) %ptr, i32 %val) #0 {
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %val)
- store i32 %ret, ptr addrspace(1) %ptr, align 4
- ret void
-}
-
-; CHECK: DIVERGENT: %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in)
-define amdgpu_kernel void @test_ds_atomic_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) #0 {
-entry:
- %gep = getelementptr i32, ptr addrspace(3) %addr, i32 4
- %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in)
- store i32 %val, ptr addrspace(3) %use
- ret void
-}
-
-; CHECK: DIVERGENT: %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in)
-define amdgpu_kernel void @test_flat_atomic_cond_sub_u32(ptr %addr, i32 %in, ptr %use) #0 {
-entry:
- %gep = getelementptr i32, ptr %addr, i32 4
- %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in)
- store i32 %val, ptr %use
- ret void
-}
-
-; CHECK: DIVERGENT: %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in)
-define amdgpu_kernel void @test_global_atomic_cond_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) #0 {
-entry:
- %gep = getelementptr i32, ptr addrspace(1) %addr, i32 4
- %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in)
- store i32 %val, ptr addrspace(1) %use
- ret void
-}
-
-; CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
-define float @test_raw_buffer_atomic_cond_sub_u32(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-entry:
- %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-; CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
-define float @test_struct_buffer_atomic_cond_sub_u32(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-entry:
- %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-declare i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) nocapture, i32) #1
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3), i32) #1
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr, i32) #1
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1), i32) #1
-declare i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32) #1
-declare i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32, i32) #1
-
attributes #0 = { nounwind }
attributes #1 = { argmemonly nounwind willreturn }
diff --git a/llvm/test/Bitcode/amdgcn-atomic.ll b/llvm/test/Bitcode/amdgcn-atomic.ll
index 3e28cd0..e9194ea 100644
--- a/llvm/test/Bitcode/amdgcn-atomic.ll
+++ b/llvm/test/Bitcode/amdgcn-atomic.ll
@@ -420,5 +420,152 @@ define double @upgrade_amdgcn_global_atomic_fmax_f64_p1_f64(ptr addrspace(1) %pt
attributes #0 = { argmemonly nounwind willreturn }
+define void @atomic_usub_cond(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) {
+ ; CHECK: atomicrmw usub_cond ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result0 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+ %result1 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw usub_cond ptr addrspace(3) %ptr3, i32 46 syncscope("agent") seq_cst, align 4
+ %result2 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p3(ptr addrspace(3) %ptr3, i32 46, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw usub_cond ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result3 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 0, i1 false)
+
+ ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8
+ %result4 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 0, i64 0, i1 false)
+
+ ; CHECK: atomicrmw usub_cond ptr addrspace(3) %ptr3, i64 46 syncscope("agent") seq_cst, align 8
+ %result5 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p3(ptr addrspace(3) %ptr3, i64 46, i64 0, i64 0, i1 false)
+ ret void
+}
+
+define void @atomic_usub_sat(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) {
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result0 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw usub_sat ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+ %result1 = call i32 @llvm.amdgcn.atomic.csub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw usub_sat ptr addrspace(3) %ptr3, i32 46 syncscope("agent") seq_cst, align 4
+ %result2 = call i32 @llvm.amdgcn.atomic.csub.i32.p3(ptr addrspace(3) %ptr3, i32 46, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result3 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 0, i1 false)
+
+ ; CHECK: atomicrmw usub_sat ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8
+ %result4 = call i64 @llvm.amdgcn.atomic.csub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 0, i64 0, i1 false)
+
+ ; CHECK: atomicrmw usub_sat ptr addrspace(3) %ptr3, i64 46 syncscope("agent") seq_cst, align 8
+ %result5 = call i64 @llvm.amdgcn.atomic.csub.i64.p3(ptr addrspace(3) %ptr3, i64 46, i64 0, i64 0, i1 false)
+ ret void
+}
+
+; Test handling of invalid ordering values
+define void @ordering_usub_cond_usub_sat(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) {
+ ; CHECK: atomicrmw volatile usub_cond ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result0 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p0(ptr %ptr0, i32 42, i32 -1, i32 0, i1 true)
+
+ ; CHECK: atomicrmw volatile usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+ %result1 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 true)
+
+ ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+ %result2 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 1, i32 0, i1 false)
+
+ ; CHECK: atomicrmw volatile usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") monotonic, align 4
+ %result3 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 2, i32 0, i1 true)
+
+ ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+ %result4 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 3, i32 0, i1 false)
+
+ ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result5 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 4, i1 true)
+
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result6 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 5, i1 false)
+
+ ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result7 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 6, i1 true)
+
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result8 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 7, i1 false)
+
+  ; CHECK: = atomicrmw volatile usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result9 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 8, i1 true)
+
+  ; CHECK: = atomicrmw volatile usub_sat ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+ %result10 = call i32 @llvm.amdgcn.atomic.csub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 3, i32 0, i1 true)
+
+ ; CHECK: atomicrmw volatile usub_cond ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result11 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p0(ptr %ptr0, i64 42, i64 -1, i64 0, i1 true)
+
+ ; CHECK: atomicrmw volatile usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8
+ %result12 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 0, i64 0, i1 true)
+
+ ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8
+ %result13 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 1, i64 0, i1 false)
+
+ ; CHECK: atomicrmw volatile usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") monotonic, align 8
+ %result14 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 2, i64 0, i1 true)
+
+ ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8
+ %result15 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 3, i64 0, i1 false)
+
+ ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result16 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 4, i1 true)
+
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result17 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 5, i1 false)
+
+ ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result18 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 6, i1 true)
+
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result19 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 7, i1 false)
+
+  ; CHECK: = atomicrmw volatile usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result20 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 8, i1 true)
+
+  ; CHECK: = atomicrmw volatile usub_sat ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8
+ %result21 = call i64 @llvm.amdgcn.atomic.csub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 3, i64 0, i1 true)
+ ret void
+}
+
+define void @immarg_violations_usub_sat(ptr %ptr0, i32 %val32, i1 %val1, i64 %val64) {
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result0 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 %val32, i32 0, i1 false)
+
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") monotonic, align 4
+ %result1 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 2, i32 %val32, i1 false)
+
+ ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i32 42 syncscope("agent") monotonic, align 4
+ %result2 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 2, i32 0, i1 %val1)
+
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result3 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 %val64, i64 0, i1 false)
+
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") monotonic, align 8
+ %result4 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 2, i64 %val64, i1 false)
+
+ ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i64 42 syncscope("agent") monotonic, align 8
+ %result5 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 2, i64 0, i1 %val1)
+ ret void
+}
+
+declare i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i32 @llvm.amdgcn.atomic.cond.sub.i32.p3(ptr addrspace(3) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i32 @llvm.amdgcn.atomic.cond.sub.i32.p0(ptr nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.cond.sub.i64.p3(ptr addrspace(3) nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.cond.sub.i64.p0(ptr nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0
+
+declare i32 @llvm.amdgcn.atomic.csub.i32.p1(ptr addrspace(1) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i32 @llvm.amdgcn.atomic.csub.i32.p3(ptr addrspace(3) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.csub.i64.p1(ptr addrspace(1) nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.csub.i64.p3(ptr addrspace(3) nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0
+
; CHECK: !0 = !{i32 5, i32 6}
; CHECK: !1 = !{}
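
Reading the CHECK lines above, the bitcode auto-upgrader folds the trailing immediate operands into the atomicrmw form: an ordering operand of 2 yields monotonic, while other values (including out-of-range ones such as 8 and the non-immediate violations) fall back to seq_cst, the syncscope is "agent" throughout, and a true final operand marks the operation volatile. A hedged sketch of one such mapping, with %p a placeholder pointer:

    ; ordering immarg 2 with the volatile flag set ...
    %before = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %p, i32 42, i32 2, i32 0, i1 true)
    ; ... upgrades to:
    %after = atomicrmw volatile usub_sat ptr %p, i32 42 syncscope("agent") monotonic, align 4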
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
deleted file mode 100644
index bff4771..0000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
+++ /dev/null
@@ -1,270 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefix=GFX10
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s -check-prefix=GFX10
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GFX11
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12
-
-define i32 @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl1_inv
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data seq_cst, !amdgpu.no.remote.memory !0
- ret i32 %ret
-}
-
-define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_offset:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl1_inv
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub_offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub_offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst, !amdgpu.no.remote.memory !0
- ret i32 %ret
-}
-
-define void @global_atomic_csub_nortn(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl1_inv
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub_nortn:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub_nortn:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data seq_cst, !amdgpu.no.remote.memory !0
- ret void
-}
-
-define void @global_atomic_csub_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_offset_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl1_inv
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub_offset_nortn:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub_offset_nortn:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst, !amdgpu.no.remote.memory !0
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_sgpr_base_offset:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl1_inv
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: global_store_dword v[0:1], v0, off
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: global_atomic_csub_sgpr_base_offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: global_store_b32 v[0:1], v0, off
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-LABEL: global_atomic_csub_sgpr_base_offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: global_store_b32 v[0:1], v0, off
-; GFX12-NEXT: s_endpgm
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst, !amdgpu.no.remote.memory !0
- store i32 %ret, ptr addrspace(1) poison
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_sgpr_base_offset_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl1_inv
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: global_atomic_csub_sgpr_base_offset_nortn:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-LABEL: global_atomic_csub_sgpr_base_offset_nortn:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: s_endpgm
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst, !amdgpu.no.remote.memory !0
- ret void
-}
-
-attributes #0 = { nounwind willreturn }
-attributes #1 = { argmemonly nounwind }
-
-!0 = !{}
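
The deleted GlobalISel file already expressed the operation as atomicrmw usub_sat, so its coverage is presumably subsumed by the shared atomicrmw codegen tests. A standalone reduction of what it exercised, reusing one of the removed RUN configurations (a sketch, not the actual replacement test):

    ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12
    define i32 @global_atomic_usub_sat(ptr addrspace(1) %ptr, i32 %data) {
      %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data seq_cst, !amdgpu.no.remote.memory !0
      ret i32 %ret
    }
    !0 = !{}

Per the removed checks, GFX12 selects global_atomic_sub_clamp_u32 for this, while GFX10 and GFX11 use global_atomic_csub and global_atomic_csub_u32.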
diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
index 4af2d58..d281492 100644
--- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
@@ -2,190 +2,222 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISEL %s
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3), i32)
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1), i32)
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr, i32)
-
-define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) {
-; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32:
+define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32(ptr %addr, i32 %in) {
+; GFX12-SDAG-LABEL: flat_atomic_usub_cond_no_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], -16
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32:
+; GFX12-GISEL-LABEL: flat_atomic_usub_cond_no_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_endpgm
entry:
- %gep = getelementptr inbounds i32, ptr %addr, i32 -4
- %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in)
+ %gep = getelementptr i32, ptr %addr, i32 -4
+ %unused = atomicrmw usub_cond ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
ret void
}
-define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
-; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced:
+define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
+; GFX12-SDAG-LABEL: flat_atomic_usub_cond_no_rtn_u32_forced:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], -16
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced:
+; GFX12-GISEL-LABEL: flat_atomic_usub_cond_no_rtn_u32_forced:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_endpgm
entry:
- %gep = getelementptr inbounds i32, ptr %addr, i32 -4
- %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in)
+ %gep = getelementptr i32, ptr %addr, i32 -4
+ %unused = atomicrmw usub_cond ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
ret void
}
-define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr %use) {
-; GFX12-SDAG-LABEL: flat_atomic_cond_sub_rtn_u32:
+define amdgpu_kernel void @flat_atomic_usub_cond_rtn_u32(ptr %addr, i32 %in, ptr %use) {
+; GFX12-SDAG-LABEL: flat_atomic_usub_cond_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-SDAG-NEXT: flat_store_b32 v[0:1], v2
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: flat_atomic_cond_sub_rtn_u32:
+; GFX12-GISEL-LABEL: flat_atomic_usub_cond_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 16
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-GISEL-NEXT: flat_store_b32 v[0:1], v2
; GFX12-GISEL-NEXT: s_endpgm
entry:
- %gep = getelementptr inbounds i32, ptr %addr, i32 4
- %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in)
+ %gep = getelementptr i32, ptr %addr, i32 4
+ %val = atomicrmw usub_cond ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr %use
ret void
}
-define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) {
-; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32:
+define amdgpu_kernel void @global_atomic_usub_cond_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) {
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_no_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], -16
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32:
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_no_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4
- %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in)
+ %unused = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
ret void
}
-define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
-; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32_forced:
+define amdgpu_kernel void @global_atomic_usub_cond_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_no_rtn_u32_forced:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], -16
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32_forced:
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_no_rtn_u32_forced:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4
- %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in)
+ %unused = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
ret void
}
-define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) {
-; GFX12-SDAG-LABEL: global_atomic_cond_sub_rtn_u32:
+define amdgpu_kernel void @global_atomic_usub_cond_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) {
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: global_atomic_cond_sub_rtn_u32:
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 16
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %addr, i32 4
- %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in)
+ %val = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %use
ret void
}
-define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) {
-; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32:
+define amdgpu_kernel void @ds_usub_cond_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) {
+; GFX12-SDAG-LABEL: ds_usub_cond_no_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
@@ -193,9 +225,11 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %i
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX12-SDAG-NEXT: ds_cond_sub_u32 v0, v1
+; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32:
+; GFX12-GISEL-LABEL: ds_usub_cond_no_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
@@ -203,15 +237,17 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %i
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX12-GISEL-NEXT: ds_cond_sub_u32 v0, v1
+; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(3) %addr, i32 -4
- %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in)
+ %unused = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
ret void
}
-define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
-; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32_forced:
+define amdgpu_kernel void @ds_usub_cond_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
+; GFX12-SDAG-LABEL: ds_usub_cond_no_rtn_u32_forced:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
@@ -219,9 +255,11 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr,
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX12-SDAG-NEXT: ds_cond_sub_u32 v0, v1
+; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32_forced:
+; GFX12-GISEL-LABEL: ds_usub_cond_no_rtn_u32_forced:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
@@ -229,38 +267,44 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr,
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX12-GISEL-NEXT: ds_cond_sub_u32 v0, v1
+; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(3) %addr, i32 -4
- %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in)
+ %unused = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
ret void
}
-define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) {
-; GFX12-SDAG-LABEL: ds_cond_sub_rtn_u32:
+define amdgpu_kernel void @ds_usub_cond_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) {
+; GFX12-SDAG-LABEL: ds_usub_cond_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1 offset:16
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX12-SDAG-NEXT: ds_store_b32 v1, v0
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: ds_cond_sub_rtn_u32:
+; GFX12-GISEL-LABEL: ds_usub_cond_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v1, v0 offset:16
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX12-GISEL-NEXT: ds_store_b32 v1, v0
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(3) %addr, i32 4
- %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in)
+ %val = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(3) %use
ret void
}
+
+!0 = !{}
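
The rewrite above follows one pattern throughout atomics_cond_sub.ll: each llvm.amdgcn.atomic.cond.sub.u32.* call becomes an atomicrmw usub_cond tagged with !amdgpu.no.remote.memory, and the function names gain the usub_cond spelling. A minimal standalone example of the new shape (the function name is a placeholder):

    define i32 @ds_usub_cond_example(ptr addrspace(3) %addr, i32 %in) {
      %gep = getelementptr i32, ptr addrspace(3) %addr, i32 4
      %val = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
      ret i32 %val
    }
    !0 = !{}

On GFX12 this selects ds_cond_sub_rtn_u32, as the checks above show; the seq_cst ordering is what introduces the added s_wait_dscnt and global_inv instructions in the expected output.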
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll
deleted file mode 100644
index 243cd59..0000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll
+++ /dev/null
@@ -1,219 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12
-
-define float @raw_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: raw_buffer_atomic_cond_sub_return:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-define void @raw_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: raw_buffer_atomic_cond_sub_no_return:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
- ret void
-}
-
-define void @raw_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 {
-; GFX12-LABEL: raw_buffer_atomic_cond_sub_no_return_forced:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
- ret void
-}
-
-define float @raw_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_return:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_mov_b32 s4, 4
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-define void @raw_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_no_return:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_mov_b32 s4, 4
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0)
- ret void
-}
-
-define void @raw_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 {
-; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_no_return_forced:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_mov_b32 s4, 4
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0)
- ret void
-}
-
-define float @struct_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: struct_buffer_atomic_cond_sub_return:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s16
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], null idxen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-define void @struct_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: struct_buffer_atomic_cond_sub_no_return:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
- ret void
-}
-
-define void @struct_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 {
-; GFX12-LABEL: struct_buffer_atomic_cond_sub_no_return_forced:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
- ret void
-}
-
-define float @struct_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_return:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s16
-; GFX12-NEXT: s_mov_b32 s4, 4
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-define void @struct_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_no_return:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16
-; GFX12-NEXT: s_mov_b32 s4, 4
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0)
- ret void
-}
-
-define void @struct_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 {
-; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_no_return_forced:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16
-; GFX12-NEXT: s_mov_b32 s4, 4
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0)
- ret void
-}
-
-declare i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32) #0
-declare i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32, i32) #0
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind "target-features"="+atomic-csub-no-rtn-insts" }
-