aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll1039
1 files changed, 919 insertions, 120 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index 873fcee..f9a24fe 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=None | FileCheck %s -check-prefix=GFX90A
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx942 -amdgpu-atomic-optimizer-strategy=None | FileCheck %s -check-prefix=GFX942
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-atomic-optimizer-strategy=None | FileCheck %s -check-prefix=GFX1250
declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32 immarg)
declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg)
@@ -38,6 +39,17 @@ define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, doub
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_atomic_add_noret_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
ret void
@@ -57,6 +69,13 @@ define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, doub
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_atomic_add_rtn_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
store double %ret, ptr poison
@@ -71,12 +90,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
@@ -86,13 +105,31 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s6, 4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
store double %ret, ptr addrspace(1) %out, align 8
@@ -121,6 +158,17 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8)
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_noret_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
ret void
@@ -140,6 +188,13 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inreg
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_rtn_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
store double %ret, ptr poison
@@ -154,12 +209,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
@@ -169,13 +224,31 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s6, 4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
store double %ret, ptr addrspace(1) %out, align 8
@@ -204,6 +277,17 @@ define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, d
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: struct_buffer_atomic_add_noret_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
ret void
@@ -223,6 +307,13 @@ define amdgpu_ps void @struct_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, d
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: struct_buffer_atomic_add_rtn_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
store double %ret, ptr poison
@@ -237,12 +328,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
@@ -252,13 +343,30 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
store double %ret, ptr addrspace(1) %out, align 8
@@ -287,6 +395,17 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_noret_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
ret void
@@ -306,6 +425,13 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inr
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_rtn_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
store double %ret, ptr poison
@@ -320,12 +446,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
@@ -335,13 +461,30 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
store double %ret, ptr addrspace(1) %out, align 8
@@ -370,6 +513,17 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, doub
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_atomic_min_noret_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
ret void
@@ -389,6 +543,13 @@ define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, doub
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_atomic_min_rtn_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
store double %ret, ptr poison
@@ -403,12 +564,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
@@ -418,13 +579,31 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s6, 4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
store double %ret, ptr addrspace(1) %out, align 8
@@ -453,6 +632,17 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8)
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
ret void
@@ -472,6 +662,13 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inreg
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_min_rtn_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
store double %ret, ptr poison
@@ -486,12 +683,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
@@ -501,13 +698,31 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s6, 4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
store double %ret, ptr addrspace(1) %out, align 8
@@ -536,6 +751,17 @@ define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, d
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: struct_buffer_atomic_min_noret_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
ret void
@@ -555,6 +781,13 @@ define amdgpu_ps void @struct_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, d
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: struct_buffer_atomic_min_rtn_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
store double %ret, ptr poison
@@ -569,12 +802,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
@@ -584,13 +817,30 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
store double %ret, ptr addrspace(1) %out, align 8
@@ -619,6 +869,17 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_min_noret_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
ret void
@@ -638,6 +899,13 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inr
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_min_rtn_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
store double %ret, ptr poison
@@ -652,12 +920,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
@@ -667,13 +935,30 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
store double %ret, ptr addrspace(1) %out, align 8
@@ -702,6 +987,17 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, doub
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_atomic_max_noret_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
ret void
@@ -721,6 +1017,13 @@ define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, doub
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_atomic_max_rtn_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
store double %ret, ptr poison
@@ -735,12 +1038,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
@@ -750,13 +1053,31 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s6, 4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
store double %ret, ptr addrspace(1) %out, align 8
@@ -785,6 +1106,17 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8)
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
ret void
@@ -804,6 +1136,13 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inreg
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_max_rtn_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
store double %ret, ptr poison
@@ -818,12 +1157,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
@@ -833,13 +1172,31 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s6, 4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
store double %ret, ptr addrspace(1) %out, align 8
@@ -868,6 +1225,17 @@ define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, d
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: struct_buffer_atomic_max_noret_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
ret void
@@ -887,6 +1255,13 @@ define amdgpu_ps void @struct_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, d
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: struct_buffer_atomic_max_rtn_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
store double %ret, ptr poison
@@ -901,12 +1276,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
@@ -916,13 +1291,30 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
store double %ret, ptr addrspace(1) %out, align 8
@@ -951,6 +1343,17 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_max_noret_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
ret void
@@ -970,6 +1373,13 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inr
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_max_rtn_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
store double %ret, ptr poison
@@ -984,12 +1394,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
@@ -999,13 +1409,30 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
store double %ret, ptr addrspace(1) %out, align 8
@@ -1038,6 +1465,19 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
@@ -1067,6 +1507,17 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_agent:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
@@ -1098,6 +1549,19 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_system:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
@@ -1127,6 +1591,17 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_flush:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
@@ -1154,6 +1629,19 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: global_atomic_fadd_f64_rtn_pat:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
ret double %ret
@@ -1179,6 +1667,18 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: global_atomic_fadd_f64_rtn_pat_agent:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret double %ret
@@ -1206,6 +1706,19 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: global_atomic_fadd_f64_rtn_pat_system:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0
ret double %ret
@@ -1246,6 +1759,17 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
ret void
@@ -1277,6 +1801,19 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
ret void
@@ -1306,6 +1843,17 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
ret void
@@ -1337,6 +1885,19 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_system:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
ret void
@@ -1364,6 +1925,19 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 {
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: flat_atomic_fadd_f64_rtn_pat:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
main_body:
%ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
ret double %ret
@@ -1389,6 +1963,18 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 {
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: flat_atomic_fadd_f64_rtn_pat_agent:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
main_body:
%ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
ret double %ret
@@ -1418,6 +2004,19 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 {
; GFX942-NEXT: buffer_inv sc0 sc1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: flat_atomic_fadd_f64_rtn_pat_system:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
main_body:
%ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
ret double %ret
@@ -1458,6 +2057,17 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1
ret void
@@ -1485,6 +2095,31 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, do
; GFX942-NEXT: ds_add_f64 v2, v[0:1]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_atomic_fadd_f64_noret:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s2
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: ds_load_b64 v[0:1], v0
+; GFX1250-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_add_f64_e32 v[4:5], s[0:1], v[0:1]
+; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[4:5], v2, v[4:5], v[0:1]
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[0:1]
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
+; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1250-NEXT: s_cbranch_execnz .LBB51_1
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
ret void
@@ -1508,6 +2143,30 @@ define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) {
; GFX942-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: local_atomic_fadd_f64_rtn:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v4, v1
+; GFX1250-NEXT: ds_load_b64 v[0:1], v0
+; GFX1250-NEXT: s_mov_b32 s0, 0
+; GFX1250-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[6:7], v[4:5]
+; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[6:7]
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
+; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB52_1
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
main_body:
%ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
ret double %ret
@@ -1534,6 +2193,29 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
; GFX942-NEXT: ds_add_f64 v2, v[0:1]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v2, s0
+; GFX1250-NEXT: s_mov_b32 s0, 0
+; GFX1250-NEXT: ds_load_b64 v[0:1], v0
+; GFX1250-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_add_f64_e32 v[4:5], 4.0, v[0:1]
+; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[4:5], v2, v[4:5], v[0:1]
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[0:1]
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
+; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB53_1
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
@@ -1560,6 +2242,29 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
; GFX942-NEXT: ds_add_f64 v2, v[0:1]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v2, s0
+; GFX1250-NEXT: s_mov_b32 s0, 0
+; GFX1250-NEXT: ds_load_b64 v[0:1], v0
+; GFX1250-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_add_f64_e32 v[4:5], 4.0, v[0:1]
+; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[4:5], v2, v[4:5], v[0:1]
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[0:1]
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
+; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB54_1
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
@@ -1586,6 +2291,29 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX942-NEXT: ds_add_f64 v2, v[0:1]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v2, s0
+; GFX1250-NEXT: s_mov_b32 s0, 0
+; GFX1250-NEXT: ds_load_b64 v[0:1], v0
+; GFX1250-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_add_f64_e32 v[4:5], 4.0, v[0:1]
+; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[4:5], v2, v[4:5], v[0:1]
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[0:1]
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
+; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB55_1
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
ret void
@@ -1608,6 +2336,29 @@ define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data
; GFX942-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: local_atomic_fadd_f64_rtn_pat:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, v0
+; GFX1250-NEXT: ds_load_b64 v[0:1], v0
+; GFX1250-NEXT: s_mov_b32 s0, 0
+; GFX1250-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: v_add_f64_e32 v[0:1], 4.0, v[4:5]
+; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[4:5]
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5]
+; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB56_1
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
ret double %ret
@@ -1631,6 +2382,30 @@ define double @local_atomic_fadd_f64_rtn_ieee_unsafe(ptr addrspace(3) %ptr, doub
; GFX942-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: local_atomic_fadd_f64_rtn_ieee_unsafe:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v4, v1
+; GFX1250-NEXT: ds_load_b64 v[0:1], v0
+; GFX1250-NEXT: s_mov_b32 s0, 0
+; GFX1250-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[6:7], v[4:5]
+; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[6:7]
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
+; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB57_1
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
main_body:
%ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
ret double %ret
@@ -1654,6 +2429,30 @@ define double @local_atomic_fadd_f64_rtn_ieee_safe(ptr addrspace(3) %ptr, double
; GFX942-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: local_atomic_fadd_f64_rtn_ieee_safe:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v4, v1
+; GFX1250-NEXT: ds_load_b64 v[0:1], v0
+; GFX1250-NEXT: s_mov_b32 s0, 0
+; GFX1250-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[6:7], v[4:5]
+; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[6:7]
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
+; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB58_1
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
main_body:
%ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
ret double %ret