diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll | 1039 |
1 files changed, 919 insertions, 120 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 873fcee..f9a24fe 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=None | FileCheck %s -check-prefix=GFX90A ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx942 -amdgpu-atomic-optimizer-strategy=None | FileCheck %s -check-prefix=GFX942 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-atomic-optimizer-strategy=None | FileCheck %s -check-prefix=GFX1250 declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32 immarg) declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg) @@ -38,6 +39,17 @@ define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, doub ; GFX942-NEXT: v_mov_b32_e32 v2, s8 ; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_atomic_add_noret_f64: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX1250-NEXT: v_mov_b32_e32 v2, s10 +; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) ret void @@ -57,6 +69,13 @@ define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, doub ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_atomic_add_rtn_f64: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr poison @@ -71,12 +90,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: @@ -86,13 +105,31 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s6, 4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX1250-NEXT: v_mov_b32_e32 v2, s10 +; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -121,6 +158,17 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) ; GFX942-NEXT: v_mov_b32_e32 v2, s8 ; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_ptr_buffer_atomic_add_noret_f64: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX1250-NEXT: v_mov_b32_e32 v2, s10 +; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) ret void @@ -140,6 +188,13 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inreg ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_ptr_buffer_atomic_add_rtn_f64: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr poison @@ -154,12 +209,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: @@ -169,13 +224,31 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s6, 4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX1250-NEXT: v_mov_b32_e32 v2, s10 +; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -204,6 +277,17 @@ define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, d ; GFX942-NEXT: v_mov_b32_e32 v2, s8 ; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_buffer_atomic_add_noret_f64: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX1250-NEXT: v_mov_b32_e32 v2, s10 +; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void @@ -223,6 +307,13 @@ define amdgpu_ps void @struct_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, d ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_buffer_atomic_add_rtn_f64: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr poison @@ -237,12 +328,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> % ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: @@ -252,13 +343,30 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> % ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX1250-NEXT: v_mov_b32_e32 v2, s10 +; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -287,6 +395,17 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace( ; GFX942-NEXT: v_mov_b32_e32 v2, s8 ; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_ptr_buffer_atomic_add_noret_f64: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX1250-NEXT: v_mov_b32_e32 v2, s10 +; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void @@ -306,6 +425,13 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inr ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_ptr_buffer_atomic_add_rtn_f64: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr poison @@ -320,12 +446,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: @@ -335,13 +461,30 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX1250-NEXT: v_mov_b32_e32 v2, s10 +; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -370,6 +513,17 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, doub ; GFX942-NEXT: v_mov_b32_e32 v2, s8 ; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_atomic_min_noret_f64: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX1250-NEXT: v_mov_b32_e32 v2, s10 +; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) ret void @@ -389,6 +543,13 @@ define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, doub ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_atomic_min_rtn_f64: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr poison @@ -403,12 +564,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: @@ -418,13 +579,31 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s6, 4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX1250-NEXT: v_mov_b32_e32 v2, s10 +; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -453,6 +632,17 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; GFX942-NEXT: v_mov_b32_e32 v2, s8 ; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_ptr_buffer_atomic_min_noret_f64: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX1250-NEXT: v_mov_b32_e32 v2, s10 +; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) ret void @@ -472,6 +662,13 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inreg ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_ptr_buffer_atomic_min_rtn_f64: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr poison @@ -486,12 +683,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: @@ -501,13 +698,31 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s6, 4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX1250-NEXT: v_mov_b32_e32 v2, s10 +; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -536,6 +751,17 @@ define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, d ; GFX942-NEXT: v_mov_b32_e32 v2, s8 ; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_buffer_atomic_min_noret_f64: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX1250-NEXT: v_mov_b32_e32 v2, s10 +; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void @@ -555,6 +781,13 @@ define amdgpu_ps void @struct_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, d ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_buffer_atomic_min_rtn_f64: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr poison @@ -569,12 +802,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> % ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: @@ -584,13 +817,30 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> % ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX1250-NEXT: v_mov_b32_e32 v2, s10 +; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -619,6 +869,17 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace( ; GFX942-NEXT: v_mov_b32_e32 v2, s8 ; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_ptr_buffer_atomic_min_noret_f64: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX1250-NEXT: v_mov_b32_e32 v2, s10 +; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void @@ -638,6 +899,13 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inr ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_ptr_buffer_atomic_min_rtn_f64: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr poison @@ -652,12 +920,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: @@ -667,13 +935,30 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX1250-NEXT: v_mov_b32_e32 v2, s10 +; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -702,6 +987,17 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, doub ; GFX942-NEXT: v_mov_b32_e32 v2, s8 ; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_atomic_max_noret_f64: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX1250-NEXT: v_mov_b32_e32 v2, s10 +; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) ret void @@ -721,6 +1017,13 @@ define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, doub ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_atomic_max_rtn_f64: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr poison @@ -735,12 +1038,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: @@ -750,13 +1053,31 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s6, 4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX1250-NEXT: v_mov_b32_e32 v2, s10 +; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -785,6 +1106,17 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; GFX942-NEXT: v_mov_b32_e32 v2, s8 ; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_ptr_buffer_atomic_max_noret_f64: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX1250-NEXT: v_mov_b32_e32 v2, s10 +; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) ret void @@ -804,6 +1136,13 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inreg ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_ptr_buffer_atomic_max_rtn_f64: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr poison @@ -818,12 +1157,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: @@ -833,13 +1172,31 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s6, 4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX1250-NEXT: v_mov_b32_e32 v2, s10 +; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -868,6 +1225,17 @@ define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, d ; GFX942-NEXT: v_mov_b32_e32 v2, s8 ; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_buffer_atomic_max_noret_f64: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX1250-NEXT: v_mov_b32_e32 v2, s10 +; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void @@ -887,6 +1255,13 @@ define amdgpu_ps void @struct_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, d ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_buffer_atomic_max_rtn_f64: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr poison @@ -901,12 +1276,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> % ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: @@ -916,13 +1291,30 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> % ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX1250-NEXT: v_mov_b32_e32 v2, s10 +; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -951,6 +1343,17 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace( ; GFX942-NEXT: v_mov_b32_e32 v2, s8 ; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_ptr_buffer_atomic_max_noret_f64: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX1250-NEXT: v_mov_b32_e32 v2, s10 +; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void @@ -970,6 +1373,13 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inr ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_ptr_buffer_atomic_max_rtn_f64: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr poison @@ -984,12 +1394,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: @@ -999,13 +1409,30 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX1250-NEXT: v_mov_b32_e32 v2, s10 +; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -1038,6 +1465,19 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -1067,6 +1507,17 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_agent: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -1098,6 +1549,19 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_system: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -1127,6 +1591,17 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_flush: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -1154,6 +1629,19 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_atomic_fadd_f64_rtn_pat: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %ret @@ -1179,6 +1667,18 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_atomic_fadd_f64_rtn_pat_agent: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %ret @@ -1206,6 +1706,19 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_atomic_fadd_f64_rtn_pat_system: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %ret @@ -1246,6 +1759,17 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst ret void @@ -1277,6 +1801,19 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void @@ -1306,6 +1843,17 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_agent: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void @@ -1337,6 +1885,19 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_system: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void @@ -1364,6 +1925,19 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 { ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: flat_atomic_fadd_f64_rtn_pat: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret @@ -1389,6 +1963,18 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 { ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: flat_atomic_fadd_f64_rtn_pat_agent: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret @@ -1418,6 +2004,19 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 { ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: flat_atomic_fadd_f64_rtn_pat_system: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret @@ -1458,6 +2057,17 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1 ret void @@ -1485,6 +2095,31 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, do ; GFX942-NEXT: ds_add_f64 v2, v[0:1] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: local_atomic_fadd_f64_noret: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s2 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: ds_load_b64 v[0:1], v0 +; GFX1250-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_add_f64_e32 v[4:5], s[0:1], v[0:1] +; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[4:5], v2, v[4:5], v[0:1] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[0:1] +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_cbranch_execnz .LBB51_1 +; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) ret void @@ -1508,6 +2143,30 @@ define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) { ; GFX942-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: local_atomic_fadd_f64_rtn: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v4, v1 +; GFX1250-NEXT: ds_load_b64 v[0:1], v0 +; GFX1250-NEXT: s_mov_b32 s0, 0 +; GFX1250-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[6:7], v[4:5] +; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[6:7] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7] +; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB52_1 +; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] main_body: %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) ret double %ret @@ -1534,6 +2193,29 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX942-NEXT: ds_add_f64 v2, v[0:1] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v2, s0 +; GFX1250-NEXT: s_mov_b32 s0, 0 +; GFX1250-NEXT: ds_load_b64 v[0:1], v0 +; GFX1250-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_add_f64_e32 v[4:5], 4.0, v[0:1] +; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[4:5], v2, v[4:5], v[0:1] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[0:1] +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB53_1 +; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -1560,6 +2242,29 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX942-NEXT: ds_add_f64 v2, v[0:1] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v2, s0 +; GFX1250-NEXT: s_mov_b32 s0, 0 +; GFX1250-NEXT: ds_load_b64 v[0:1], v0 +; GFX1250-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_add_f64_e32 v[4:5], 4.0, v[0:1] +; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[4:5], v2, v[4:5], v[0:1] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[0:1] +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB54_1 +; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -1586,6 +2291,29 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX942-NEXT: ds_add_f64 v2, v[0:1] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_endpgm +; +; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v2, s0 +; GFX1250-NEXT: s_mov_b32 s0, 0 +; GFX1250-NEXT: ds_load_b64 v[0:1], v0 +; GFX1250-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_add_f64_e32 v[4:5], 4.0, v[0:1] +; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[4:5], v2, v[4:5], v[0:1] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[0:1] +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB55_1 +; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst ret void @@ -1608,6 +2336,29 @@ define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data ; GFX942-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: local_atomic_fadd_f64_rtn_pat: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, v0 +; GFX1250-NEXT: ds_load_b64 v[0:1], v0 +; GFX1250-NEXT: s_mov_b32 s0, 0 +; GFX1250-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_add_f64_e32 v[0:1], 4.0, v[4:5] +; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[4:5] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB56_1 +; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %ret @@ -1631,6 +2382,30 @@ define double @local_atomic_fadd_f64_rtn_ieee_unsafe(ptr addrspace(3) %ptr, doub ; GFX942-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: local_atomic_fadd_f64_rtn_ieee_unsafe: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v4, v1 +; GFX1250-NEXT: ds_load_b64 v[0:1], v0 +; GFX1250-NEXT: s_mov_b32 s0, 0 +; GFX1250-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[6:7], v[4:5] +; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[6:7] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7] +; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB57_1 +; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] main_body: %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) ret double %ret @@ -1654,6 +2429,30 @@ define double @local_atomic_fadd_f64_rtn_ieee_safe(ptr addrspace(3) %ptr, double ; GFX942-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: local_atomic_fadd_f64_rtn_ieee_safe: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v4, v1 +; GFX1250-NEXT: ds_load_b64 v[0:1], v0 +; GFX1250-NEXT: s_mov_b32 s0, 0 +; GFX1250-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[6:7], v[4:5] +; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[6:7] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7] +; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB58_1 +; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] main_body: %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) ret double %ret |