From 3cf539fb046457a444e93cefc87cca10cbd3b807 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 4 Apr 2024 10:14:16 +0100 Subject: [AMDGPU] Combine or remove redundant waitcnts at the end of each MBB (#87539) Call generateWaitcnt unconditionally at the end of SIInsertWaitcnts::insertWaitcntInBlock. Even if we don't need to generate a new waitcnt instruction it has the effect of combining or removing redundant waitcnts that were already present. Tests show various small improvements in waitcnt placement. --- .../AMDGPU/GlobalISel/divergent-control-flow.ll | 4 - .../AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll | 2 - .../AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll | 2 - .../AMDGPU/atomic_optimizations_local_pointer.ll | 86 ++++++++++------------ llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll | 2 +- .../CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll | 2 +- .../test/CodeGen/AMDGPU/extract-subvector-16bit.ll | 11 --- llvm/test/CodeGen/AMDGPU/function-args.ll | 1 - .../CodeGen/AMDGPU/lds-global-non-entry-func.ll | 28 +++---- llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll | 8 +- llvm/test/CodeGen/AMDGPU/skip-if-dead.ll | 4 - .../transform-block-with-return-to-epilog.ll | 4 +- 12 files changed, 58 insertions(+), 96 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll index c25b0f2..78d9084 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -16,7 +16,6 @@ define i32 @divergent_if_swap_brtarget_order0(i32 %value) { ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: .LBB0_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: %c = icmp ne i32 %value, 0 @@ -44,7 +43,6 @@ define i32 @divergent_if_swap_brtarget_order1(i32 %value) { ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: .LBB1_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: %c = icmp ne i32 %value, 0 @@ -74,7 +72,6 @@ define i32 @divergent_if_nonboolean_condition0(i32 %value) { ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: .LBB2_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: %c = trunc i32 %value to i1 @@ -106,7 +103,6 @@ define i32 @divergent_if_nonboolean_condition1(ptr addrspace(1) %ptr) { ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: .LBB3_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: %value = load i32, ptr addrspace(1) %ptr diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll index 303dc46..5c22d5b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll @@ -131,8 +131,6 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: .LBB1_2: ; %bb1 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = call i1 @llvm.amdgcn.is.private(ptr %ptr) br i1 %val, label %bb0, label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll index 63702d2..e005c38 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll @@ -131,8 +131,6 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: .LBB1_2: ; %bb1 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = call i1 @llvm.amdgcn.is.shared(ptr %ptr) br i1 %val, label %bb0, label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 352adac..af6f6913 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -39,9 +39,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 ; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -65,11 +65,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -92,11 +92,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -253,8 +253,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -504,11 +504,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -544,11 +544,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_add_u32_e32 v0, s4, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -944,7 +944,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 @@ -952,6 +951,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 ; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -974,7 +974,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB4_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_readfirstlane_b32 s3, v1 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -1006,7 +1005,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB4_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_readfirstlane_b32 s3, v1 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -1219,11 +1217,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB5_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1] ; GFX8-NEXT: s_mov_b32 s7, 0xf000 @@ -1258,11 +1256,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -1530,10 +1528,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 ; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -1557,12 +1555,12 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB7_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -1585,12 +1583,12 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB7_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -1751,8 +1749,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2006,11 +2004,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB9_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -2046,11 +2044,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB9_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -2446,7 +2444,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 @@ -2454,6 +2451,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 ; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -2477,7 +2475,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB11_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -2487,6 +2484,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -2509,7 +2507,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB11_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -2519,6 +2516,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -3081,11 +3079,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB14_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_and_b32_e32 v0, s4, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -3121,11 +3119,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB14_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_and_b32_e32 v0, s4, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -3355,11 +3353,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB15_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_or_b32_e32 v0, s4, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -3395,11 +3393,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB15_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_or_b32_e32 v0, s4, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -3629,11 +3627,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB16_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_xor_b32_e32 v0, s4, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -3669,11 +3667,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB16_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -3903,11 +3901,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB17_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_max_i32_e32 v0, s4, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -3943,11 +3941,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB17_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_max_i32_e32 v0, s4, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -4151,7 +4149,6 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 @@ -4162,6 +4159,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -4182,7 +4180,6 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB18_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -4216,7 +4213,6 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB18_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -4419,11 +4415,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB19_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_min_i32_e32 v0, s4, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -4459,11 +4455,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB19_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_min_i32_e32 v0, s4, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -4667,7 +4663,6 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 @@ -4678,6 +4673,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -4698,7 +4694,6 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB20_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -4732,7 +4727,6 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB20_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -4935,11 +4929,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB21_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_max_u32_e32 v0, s4, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -4975,11 +4969,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB21_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_max_u32_e32 v0, s4, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -5183,7 +5177,6 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -5193,6 +5186,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 ; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -5214,7 +5208,6 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB22_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -5226,6 +5219,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -5246,7 +5240,6 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB22_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -5258,6 +5251,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -5446,11 +5440,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB23_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_min_u32_e32 v0, s4, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -5486,11 +5480,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB23_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_min_u32_e32 v0, s4, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -5694,7 +5688,6 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -5704,6 +5697,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -5725,7 +5719,6 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB24_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -5737,6 +5730,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -5757,7 +5751,6 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB24_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -5769,6 +5762,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index 19a1d2d9..c9076a9 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -186,7 +186,7 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB1_8: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll index ac50fb8..da609bf 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll @@ -41,7 +41,7 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(ptr add ; GCN-NEXT: .LBB0_2: ; %endif ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, 0x3d0000 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_store_dword v1, v0, s[0:1] offset:2300 ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll index 069c57e..6dabd8c 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -103,7 +103,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB0_4: ; %exit -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0] ; GFX9-NEXT: s_movk_i32 s4, 0x8000 ; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0 @@ -131,7 +130,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB0_4: ; %exit -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -266,7 +264,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace( ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB1_4: ; %exit -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1] ; GFX9-NEXT: s_movk_i32 s4, 0x8000 ; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0 @@ -294,7 +291,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace( ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB1_4: ; %exit -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -431,7 +427,6 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1 ; GFX9-NEXT: .LBB2_4: ; %exit ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x3800 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc @@ -461,7 +456,6 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB2_4: ; %exit ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3d00 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -665,7 +659,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB3_4: ; %exit -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -871,7 +864,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB4_4: ; %exit -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -1081,7 +1073,6 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace( ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB5_4: ; %exit ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3d00 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -1432,7 +1423,6 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB7_4: ; %exit -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v5 ; GFX11-NEXT: v_mov_b32_e32 v9, 0x3900 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00 @@ -1724,7 +1714,6 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB8_4: ; %exit -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v5 ; GFX11-NEXT: v_mov_b32_e32 v9, 0x3900 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00 diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index db89ad6..3b2f15c 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -114,7 +114,6 @@ define void @i1_arg_i1_use(i1 %arg) #0 { ; CIGFX89-NEXT: s_waitcnt vmcnt(0) ; CIGFX89-NEXT: .LBB3_2: ; %bb2 ; CIGFX89-NEXT: s_or_b64 exec, exec, s[4:5] -; CIGFX89-NEXT: s_waitcnt vmcnt(0) ; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: i1_arg_i1_use: diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll index 433a836..3b3e107 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll @@ -33,7 +33,7 @@ define void @func_use_lds_global() { ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-SDAG-NEXT: s_mov_b32 m0, -1 -; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8 ; GFX8-SDAG-NEXT: ds_write_b32 v0, v0 ; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -103,7 +103,7 @@ define void @func_use_lds_global_constexpr_cast() { ; GFX8-SDAG-LABEL: func_use_lds_global_constexpr_cast: ; GFX8-SDAG: ; %bb.0: ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8 ; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-SDAG-NEXT: s_trap 2 @@ -171,7 +171,7 @@ define void @func_uses_lds_multi(i1 %cond) { ; GFX8-SDAG-NEXT: s_cbranch_execz .LBB2_2 ; GFX8-SDAG-NEXT: ; %bb.1: ; %bb1 ; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 1 -; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8 ; GFX8-SDAG-NEXT: ds_write_b32 v0, v0 ; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -181,7 +181,7 @@ define void @func_uses_lds_multi(i1 %cond) { ; GFX8-SDAG-NEXT: s_cbranch_execz .LBB2_4 ; GFX8-SDAG-NEXT: ; %bb.3: ; %bb0 ; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8 ; GFX8-SDAG-NEXT: ds_write_b32 v0, v0 ; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -189,7 +189,7 @@ define void @func_uses_lds_multi(i1 %cond) { ; GFX8-SDAG-NEXT: .LBB2_4: ; %ret ; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8 ; GFX8-SDAG-NEXT: ds_write_b32 v0, v0 ; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -379,7 +379,7 @@ define void @func_uses_lds_code_after(ptr addrspace(1) %ptr) { ; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-SDAG-NEXT: s_mov_b32 m0, -1 ; GFX8-SDAG-NEXT: ds_write_b32 v0, v2 -; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8 ; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -472,7 +472,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GFX8-SDAG-NEXT: ; %bb.1: ; %use.bb ; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-SDAG-NEXT: s_mov_b32 m0, -1 -; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8 ; GFX8-SDAG-NEXT: ds_write_b32 v0, v0 ; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -481,7 +481,6 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX8-SDAG-NEXT: .LBB4_2: ; %ret ; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: func_uses_lds_phi_after: @@ -506,7 +505,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: .LBB4_2: ; %ret ; GFX8-GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: func_uses_lds_phi_after: @@ -527,7 +526,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: .LBB4_2: ; %ret ; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: func_uses_lds_phi_after: @@ -548,7 +547,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: .LBB4_2: ; %ret ; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-LABEL: func_uses_lds_phi_after: @@ -570,7 +569,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: .LBB4_3: ; %ret ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] ; SDAG-NEXT: .LBB4_4: ; SDAG-NEXT: s_endpgm @@ -594,7 +593,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: .LBB4_3: ; %ret ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] ; GISEL-NEXT: .LBB4_4: ; GISEL-NEXT: s_endpgm @@ -616,6 +615,3 @@ ret: ; CHECK: {{.*}} ; GFX8: {{.*}} ; GFX9: {{.*}} - -!llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdhsa_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll index 5e76dfd..4477f02 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll @@ -157,7 +157,6 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; VI-NEXT: .LBB2_2: ; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: s_mov_b64 s[6:7], exec -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_readfirstlane_b32 s8, v1 ; VI-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0 ; VI-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1 @@ -203,15 +202,14 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; VI-NEXT: ; %bb.7: ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_add_rtn_f32 v2, v2, v1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: .LBB2_8: ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_readfirstlane_b32 s2, v2 ; VI-NEXT: v_add_f32_e32 v2, s2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -240,7 +238,6 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; GFX9-NEXT: .LBB2_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: s_mov_b64 s[6:7], exec -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1 @@ -285,16 +282,15 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; GFX9-NEXT: s_cbranch_execz .LBB2_8 ; GFX9-NEXT: ; %bb.7: ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB2_8: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_add_f32_e32 v0, s2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll index 138dd53..d19ef75 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -1260,8 +1260,6 @@ define amdgpu_ps void @phi_use_def_before_kill(float inreg %x) #0 { ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: .LBB11_5: ; %end -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB11_6: ; GFX11-NEXT: s_mov_b64 exec, 0 @@ -1525,8 +1523,6 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: .LBB13_5: ; %UnifiedReturnBlock -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB13_6: ; GFX11-NEXT: s_mov_b64 exec, 0 diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll index eef5f57..ecebbb9 100644 --- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll +++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll @@ -32,7 +32,7 @@ define amdgpu_ps float @test_return_to_epilog_into_end_block(i32 inreg %a, float ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; GCN-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GCN-NEXT: S_WAITCNT_soft 3952 + ; GCN-NEXT: S_WAITCNT 3952 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: entry: @@ -79,7 +79,7 @@ define amdgpu_ps float @test_unify_return_to_epilog_into_end_block(i32 inreg %a, ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; GCN-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GCN-NEXT: S_WAITCNT_soft 3952 + ; GCN-NEXT: S_WAITCNT 3952 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.5: entry: -- cgit v1.1 From 5b59ae423a9e86beddafb868b9d549b2f18825ab Mon Sep 17 00:00:00 2001 From: Piotr Sobczak Date: Thu, 4 Apr 2024 16:47:25 +0200 Subject: [DAG] Preserve NUW when reassociating (#87621) Similarly to the generic case below, preserve the NUW flag when reassociating adds with constants. --- llvm/test/CodeGen/AMDGPU/bf16.ll | 73 +-- llvm/test/CodeGen/AMDGPU/function-returns.ll | 87 ++- .../CodeGen/AMDGPU/gfx-callable-return-types.ll | 582 ++++++++------------- 3 files changed, 287 insertions(+), 455 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 9865883..bf4302c 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -5678,22 +5678,18 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_clause 0x4 -; GFX11-NEXT: scratch_store_b128 off, v[18:21], s0 offset:64 -; GFX11-NEXT: scratch_store_b128 off, v[10:13], s0 offset:32 -; GFX11-NEXT: scratch_store_b128 off, v[6:9], s0 offset:16 -; GFX11-NEXT: scratch_store_b128 off, v[2:5], s0 -; GFX11-NEXT: scratch_store_b16 off, v1, s0 offset:128 -; GFX11-NEXT: s_add_i32 s1, s0, 0x70 -; GFX11-NEXT: s_add_i32 s2, s0, 0x60 -; GFX11-NEXT: s_add_i32 s3, s0, 0x50 -; GFX11-NEXT: s_add_i32 s0, s0, 48 +; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: scratch_store_b128 v0, v[22:25], off offset:80 +; GFX11-NEXT: scratch_store_b128 v0, v[18:21], off offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[14:17], off offset:48 +; GFX11-NEXT: scratch_store_b128 v0, v[10:13], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[6:9], off offset:16 +; GFX11-NEXT: scratch_store_b128 v0, v[2:5], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[30:33], s1 -; GFX11-NEXT: scratch_store_b128 off, v[26:29], s2 -; GFX11-NEXT: scratch_store_b128 off, v[22:25], s3 -; GFX11-NEXT: scratch_store_b128 off, v[14:17], s0 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_store_b128 v0, v[30:33], off offset:112 +; GFX11-NEXT: scratch_store_b128 v0, v[26:29], off offset:96 +; GFX11-NEXT: scratch_store_b16 v0, v1, off offset:128 ; GFX11-NEXT: s_setpc_b64 s[30:31] %ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0 %ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1 @@ -8827,19 +8823,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX11-NEXT: global_load_u16 v32, v[1:2], off offset:54 ; GFX11-NEXT: global_load_u16 v33, v[1:2], off offset:58 ; GFX11-NEXT: global_load_u16 v1, v[1:2], off offset:62 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: s_add_i32 s1, s0, 0xf0 -; GFX11-NEXT: s_add_i32 s2, s0, 0xe0 -; GFX11-NEXT: s_add_i32 s3, s0, 0xd0 -; GFX11-NEXT: s_add_i32 s4, s0, 0xc0 -; GFX11-NEXT: s_add_i32 s5, s0, 0xb0 -; GFX11-NEXT: s_add_i32 s6, s0, 0xa0 -; GFX11-NEXT: s_add_i32 s7, s0, 0x90 -; GFX11-NEXT: s_add_i32 s8, s0, 0x70 -; GFX11-NEXT: s_add_i32 s9, s0, 0x60 -; GFX11-NEXT: s_add_i32 s10, s0, 0x50 -; GFX11-NEXT: s_add_i32 s11, s0, 48 ; GFX11-NEXT: s_waitcnt vmcnt(31) ; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; GFX11-NEXT: s_waitcnt vmcnt(30) @@ -8936,23 +8919,23 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[3:4], v2 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[1:2], v37 -; GFX11-NEXT: scratch_store_b128 off, v[96:99], s1 -; GFX11-NEXT: scratch_store_b128 off, v[84:87], s2 -; GFX11-NEXT: scratch_store_b128 off, v[80:83], s3 -; GFX11-NEXT: scratch_store_b128 off, v[68:71], s4 -; GFX11-NEXT: scratch_store_b128 off, v[64:67], s5 -; GFX11-NEXT: scratch_store_b128 off, v[52:55], s6 -; GFX11-NEXT: scratch_store_b128 off, v[48:51], s7 -; GFX11-NEXT: scratch_store_b128 off, v[33:36], s0 offset:128 -; GFX11-NEXT: scratch_store_b128 off, v[29:32], s8 -; GFX11-NEXT: scratch_store_b128 off, v[25:28], s9 -; GFX11-NEXT: scratch_store_b128 off, v[21:24], s10 -; GFX11-NEXT: scratch_store_b128 off, v[17:20], s0 offset:64 -; GFX11-NEXT: scratch_store_b128 off, v[13:16], s11 -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: scratch_store_b128 off, v[9:12], s0 offset:32 -; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 offset:16 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_store_b128 v0, v[96:99], off offset:240 +; GFX11-NEXT: scratch_store_b128 v0, v[84:87], off offset:224 +; GFX11-NEXT: scratch_store_b128 v0, v[80:83], off offset:208 +; GFX11-NEXT: scratch_store_b128 v0, v[68:71], off offset:192 +; GFX11-NEXT: scratch_store_b128 v0, v[64:67], off offset:176 +; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:160 +; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off offset:144 +; GFX11-NEXT: scratch_store_b128 v0, v[33:36], off offset:128 +; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:112 +; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 +; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 +; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <32 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <32 x bfloat> %load to <32 x double> diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll index acadee2..401cbce 100644 --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -1561,34 +1561,28 @@ define <33 x i32> @v33i32_func_void() #0 { ; GFX11-NEXT: buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80 ; GFX11-NEXT: buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64 ; GFX11-NEXT: buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48 -; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:16 -; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 -; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 ; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 offset:128 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: s_add_i32 s1, s0, 0x70 -; GFX11-NEXT: s_add_i32 s2, s0, 0x60 -; GFX11-NEXT: s_add_i32 s3, s0, 0x50 -; GFX11-NEXT: s_add_i32 s4, s0, 48 ; GFX11-NEXT: s_waitcnt vmcnt(8) -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 ; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:96 ; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80 ; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: scratch_store_b128 off, v[13:16], s0 offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:64 ; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: scratch_store_b128 off, v[17:20], s4 +; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:48 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: scratch_store_b128 off, v[21:24], s0 offset:16 +; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:32 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: scratch_store_b128 off, v[25:28], s0 +; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b32 off, v33, s0 offset:128 +; GFX11-NEXT: scratch_store_b32 v0, v33, off offset:128 ; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <33 x i32>, ptr addrspace(1) %ptr @@ -1850,34 +1844,28 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { ; GFX11-NEXT: buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80 ; GFX11-NEXT: buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64 ; GFX11-NEXT: buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48 -; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:16 -; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 -; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 ; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 offset:128 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: s_add_i32 s1, s0, 0x70 -; GFX11-NEXT: s_add_i32 s2, s0, 0x60 -; GFX11-NEXT: s_add_i32 s3, s0, 0x50 -; GFX11-NEXT: s_add_i32 s4, s0, 48 ; GFX11-NEXT: s_waitcnt vmcnt(8) -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 ; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:96 ; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80 ; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: scratch_store_b128 off, v[13:16], s0 offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:64 ; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: scratch_store_b128 off, v[17:20], s4 +; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:48 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: scratch_store_b128 off, v[21:24], s0 offset:16 +; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:32 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: scratch_store_b128 off, v[25:28], s0 +; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b32 off, v33, s0 offset:128 +; GFX11-NEXT: scratch_store_b32 v0, v33, off offset:128 ; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load { <32 x i32>, i32 }, ptr addrspace(1) %ptr @@ -2143,33 +2131,24 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { ; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:144 ; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:128 ; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: s_add_i32 s1, s0, 0xf0 -; GFX11-NEXT: s_add_i32 s2, s0, 0xe0 -; GFX11-NEXT: s_add_i32 s3, s0, 0xd0 -; GFX11-NEXT: s_add_i32 s4, s0, 0xc0 -; GFX11-NEXT: s_add_i32 s5, s0, 0xb0 -; GFX11-NEXT: s_add_i32 s6, s0, 0xa0 -; GFX11-NEXT: s_add_i32 s7, s0, 0x90 ; GFX11-NEXT: s_waitcnt vmcnt(8) -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:240 ; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:224 ; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:208 ; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: scratch_store_b128 off, v[13:16], s4 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:192 ; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: scratch_store_b128 off, v[17:20], s5 +; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:176 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: scratch_store_b128 off, v[21:24], s6 +; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:160 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: scratch_store_b128 off, v[25:28], s7 +; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:144 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:128 +; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:128 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b32 off, v33, s0 +; GFX11-NEXT: scratch_store_b32 v0, v33, off ; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load { i32, <32 x i32> }, ptr addrspace(1) %ptr diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index c1d6826..3b078c4 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -1989,256 +1989,138 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 { ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_clause 0x7 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:1024 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:512 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:256 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:128 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:64 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:32 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:16 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 -; GFX11-NEXT: s_add_i32 s1, s0, 0x7f0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x7e0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x7d0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x7c0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x7b0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x7a0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x790 -; GFX11-NEXT: s_add_i32 s2, s0, 0x780 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x770 -; GFX11-NEXT: s_add_i32 s2, s0, 0x760 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x750 -; GFX11-NEXT: s_add_i32 s2, s0, 0x740 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x730 -; GFX11-NEXT: s_add_i32 s2, s0, 0x720 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x710 -; GFX11-NEXT: s_add_i32 s2, s0, 0x700 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x6f0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x6e0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x6d0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x6c0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x6b0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x6a0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x690 -; GFX11-NEXT: s_add_i32 s2, s0, 0x680 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x670 -; GFX11-NEXT: s_add_i32 s2, s0, 0x660 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x650 -; GFX11-NEXT: s_add_i32 s2, s0, 0x640 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x630 -; GFX11-NEXT: s_add_i32 s2, s0, 0x620 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x610 -; GFX11-NEXT: s_add_i32 s2, s0, 0x600 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x5f0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x5e0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x5d0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x5c0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x5b0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x5a0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x590 -; GFX11-NEXT: s_add_i32 s2, s0, 0x580 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x570 -; GFX11-NEXT: s_add_i32 s2, s0, 0x560 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x550 -; GFX11-NEXT: s_add_i32 s2, s0, 0x540 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x530 -; GFX11-NEXT: s_add_i32 s2, s0, 0x520 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x510 -; GFX11-NEXT: s_add_i32 s2, s0, 0x500 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x4f0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x4e0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x4d0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x4c0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x4b0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x4a0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x490 -; GFX11-NEXT: s_add_i32 s2, s0, 0x480 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x470 -; GFX11-NEXT: s_add_i32 s2, s0, 0x460 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x450 -; GFX11-NEXT: s_add_i32 s2, s0, 0x440 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x430 -; GFX11-NEXT: s_add_i32 s2, s0, 0x420 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x410 -; GFX11-NEXT: s_add_i32 s2, s0, 0x3f0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x3e0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x3d0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x3c0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x3b0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x3a0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x390 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x380 -; GFX11-NEXT: s_add_i32 s2, s0, 0x370 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x360 -; GFX11-NEXT: s_add_i32 s2, s0, 0x350 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x340 -; GFX11-NEXT: s_add_i32 s2, s0, 0x330 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x320 -; GFX11-NEXT: s_add_i32 s2, s0, 0x310 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x300 -; GFX11-NEXT: s_add_i32 s2, s0, 0x2f0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x2e0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x2d0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x2c0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x2b0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x2a0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x290 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x280 -; GFX11-NEXT: s_add_i32 s2, s0, 0x270 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x260 -; GFX11-NEXT: s_add_i32 s2, s0, 0x250 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x240 -; GFX11-NEXT: s_add_i32 s2, s0, 0x230 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x220 -; GFX11-NEXT: s_add_i32 s2, s0, 0x210 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x1f0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x1e0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x1d0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x1c0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x1b0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x1a0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x190 -; GFX11-NEXT: s_add_i32 s2, s0, 0x180 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x170 -; GFX11-NEXT: s_add_i32 s2, s0, 0x160 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x150 -; GFX11-NEXT: s_add_i32 s2, s0, 0x140 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x130 -; GFX11-NEXT: s_add_i32 s2, s0, 0x120 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x110 -; GFX11-NEXT: s_add_i32 s2, s0, 0xf0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0xe0 -; GFX11-NEXT: s_add_i32 s2, s0, 0xd0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0xc0 -; GFX11-NEXT: s_add_i32 s2, s0, 0xb0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0xa0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x90 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x70 -; GFX11-NEXT: s_add_i32 s2, s0, 0x60 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x50 -; GFX11-NEXT: s_add_i32 s0, s0, 48 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2032 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2016 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2000 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1984 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1968 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1952 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1936 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1920 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1904 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1888 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1872 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1856 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1840 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1824 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1808 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1792 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1776 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1760 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1744 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1728 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1712 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1696 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1680 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1664 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1648 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1632 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1616 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1600 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1584 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1568 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1552 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1536 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1520 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1504 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1488 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1472 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1456 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1440 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1424 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1408 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1392 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1376 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1360 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1344 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1328 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1312 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1296 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1280 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1264 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1248 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1232 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1216 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1200 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1184 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1168 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1152 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1136 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1120 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1104 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1088 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1072 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1056 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1040 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1024 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1008 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:992 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:976 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:960 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:944 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:928 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:912 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:896 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:880 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:864 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:848 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:832 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:816 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:800 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:784 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:768 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:752 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:736 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:720 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:704 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:688 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:672 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:656 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:640 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:624 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:608 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:592 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:576 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:560 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:544 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:528 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:512 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:496 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:480 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:464 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:448 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:432 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:416 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:400 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:384 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:368 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:352 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:336 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:320 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:304 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:288 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:272 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:256 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:240 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:224 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:208 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:192 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:176 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:160 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:144 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:128 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:96 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:80 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:16 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: ret <512 x i32> zeroinitializer @@ -2636,7 +2518,6 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-LABEL: return_72xi32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: s_clause 0xc ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:212 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:208 @@ -2651,93 +2532,82 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:168 ; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:164 -; GFX11-NEXT: s_clause 0x14 -; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:124 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:120 -; GFX11-NEXT: scratch_store_b128 off, v[17:20], s0 offset:64 +; GFX11-NEXT: s_clause 0x11 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:88 +; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_load_b32 v23, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v22, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v21, off, s32 offset:104 +; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: scratch_load_b32 v20, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:136 -; GFX11-NEXT: scratch_store_b128 off, v[9:12], s0 offset:32 +; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:120 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: scratch_load_b32 v12, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v11, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v10, off, s32 offset:152 -; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 offset:16 +; GFX11-NEXT: scratch_load_b32 v15, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v14, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v13, off, s32 offset:136 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 ; GFX11-NEXT: s_clause 0xd -; GFX11-NEXT: scratch_load_b32 v8, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v7, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v6, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v5, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v9, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v12, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v16, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v20, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 -; GFX11-NEXT: s_add_i32 s1, s0, 0x110 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x100 -; GFX11-NEXT: s_add_i32 s3, s0, 0xf0 -; GFX11-NEXT: s_add_i32 s34, s0, 0xe0 -; GFX11-NEXT: s_add_i32 s35, s0, 0xd0 -; GFX11-NEXT: s_add_i32 s36, s0, 0xc0 -; GFX11-NEXT: s_add_i32 s37, s0, 0xb0 -; GFX11-NEXT: s_add_i32 s38, s0, 0xa0 -; GFX11-NEXT: s_add_i32 s39, s0, 0x90 -; GFX11-NEXT: s_add_i32 s40, s0, 0x70 -; GFX11-NEXT: s_add_i32 s41, s0, 0x60 -; GFX11-NEXT: s_add_i32 s42, s0, 0x50 -; GFX11-NEXT: s_add_i32 s43, s0, 48 ; GFX11-NEXT: s_waitcnt vmcnt(10) -; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 offset:128 +; GFX11-NEXT: scratch_store_b128 v0, v[60:63], off offset:272 ; GFX11-NEXT: s_waitcnt vmcnt(9) -; GFX11-NEXT: scratch_store_b128 off, v[9:12], s1 +; GFX11-NEXT: scratch_store_b128 v0, v[12:15], off offset:256 ; GFX11-NEXT: s_waitcnt vmcnt(8) -; GFX11-NEXT: scratch_store_b128 off, v[17:20], s2 +; GFX11-NEXT: scratch_store_b128 v0, v[16:19], off offset:240 ; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: scratch_store_b128 off, v[60:63], s3 +; GFX11-NEXT: scratch_store_b128 v0, v[20:23], off offset:224 ; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: scratch_store_b128 off, v[56:59], s34 +; GFX11-NEXT: scratch_store_b128 v0, v[56:59], off offset:208 ; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: scratch_store_b128 off, v[41:44], s35 +; GFX11-NEXT: scratch_store_b128 v0, v[41:44], off offset:192 ; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: scratch_store_b128 off, v[37:40], s36 +; GFX11-NEXT: scratch_store_b128 v0, v[37:40], off offset:176 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: scratch_store_b128 off, v[52:55], s37 +; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:160 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: scratch_store_b128 off, v[48:51], s38 +; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off offset:144 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: scratch_store_b128 off, v[33:36], s39 +; GFX11-NEXT: scratch_store_b128 v0, v[33:36], off offset:128 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[29:32], s40 -; GFX11-NEXT: scratch_store_b128 off, v[25:28], s41 -; GFX11-NEXT: scratch_store_b128 off, v[21:24], s42 -; GFX11-NEXT: scratch_store_b128 off, v[13:16], s43 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:112 +; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-NEXT: s_clause 0xc ; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:164 ; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:168 @@ -3306,7 +3176,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-LABEL: call_72xi32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s46, s33 +; GFX11-NEXT: s_mov_b32 s34, s33 ; GFX11-NEXT: s_add_i32 s33, s32, 0x1ff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s33, s33, 0xfffffe00 @@ -3353,11 +3223,11 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 ; GFX11-NEXT: s_add_i32 s0, s32, 32 ; GFX11-NEXT: s_add_i32 s1, s32, 16 +; GFX11-NEXT: s_add_i32 s2, s33, 0x200 +; GFX11-NEXT: v_writelane_b32 v60, s30, 0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 -; GFX11-NEXT: s_add_i32 s0, s33, 0x200 -; GFX11-NEXT: v_writelane_b32 v60, s30, 0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, 0 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 0 ; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, 0 @@ -3373,14 +3243,14 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: v_dual_mov_b32 v27, 0 :: v_dual_mov_b32 v26, 0 ; GFX11-NEXT: v_dual_mov_b32 v29, 0 :: v_dual_mov_b32 v28, 0 ; GFX11-NEXT: v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0 -; GFX11-NEXT: s_mov_b32 s45, return_72xi32@abs32@hi -; GFX11-NEXT: s_mov_b32 s44, return_72xi32@abs32@lo +; GFX11-NEXT: s_mov_b32 s1, return_72xi32@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, return_72xi32@abs32@lo ; GFX11-NEXT: v_writelane_b32 v60, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[44:45] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_load_b128 v[45:48], off, s33 offset:624 ; GFX11-NEXT: scratch_load_b128 v[33:36], off, s33 offset:640 -; GFX11-NEXT: s_add_i32 s0, s32, 0xa0 +; GFX11-NEXT: s_add_i32 s2, s32, 0xa0 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_mov_b32_e32 v32, v48 ; GFX11-NEXT: s_clause 0x9 @@ -3431,38 +3301,38 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: v_dual_mov_b32 v2, v5 :: v_dual_mov_b32 v3, v6 ; GFX11-NEXT: v_dual_mov_b32 v5, v8 :: v_dual_mov_b32 v6, v9 ; GFX11-NEXT: v_mov_b32_e32 v9, v20 -; GFX11-NEXT: scratch_store_b32 off, v11, s0 -; GFX11-NEXT: s_add_i32 s0, s32, 0x90 +; GFX11-NEXT: scratch_store_b32 off, v11, s2 +; GFX11-NEXT: s_add_i32 s2, s32, 0x90 ; GFX11-NEXT: v_mov_b32_e32 v11, v22 -; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0 -; GFX11-NEXT: s_add_i32 s0, s32, 0x80 +; GFX11-NEXT: scratch_store_b128 off, v[4:7], s2 +; GFX11-NEXT: s_add_i32 s2, s32, 0x80 ; GFX11-NEXT: v_mov_b32_e32 v5, v16 -; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2 ; GFX11-NEXT: v_mov_b32_e32 v0, 24 -; GFX11-NEXT: s_add_i32 s0, s32, 0x70 +; GFX11-NEXT: s_add_i32 s2, s32, 0x70 ; GFX11-NEXT: v_mov_b32_e32 v6, v17 -; GFX11-NEXT: scratch_store_b128 off, v[12:15], s0 +; GFX11-NEXT: scratch_store_b128 off, v[12:15], s2 ; GFX11-NEXT: v_mov_b32_e32 v13, v24 -; GFX11-NEXT: s_add_i32 s0, s32, 0x6c +; GFX11-NEXT: s_add_i32 s2, s32, 0x6c ; GFX11-NEXT: v_mov_b32_e32 v7, v18 -; GFX11-NEXT: scratch_store_b32 off, v0, s0 -; GFX11-NEXT: s_add_i32 s0, s32, 0x60 +; GFX11-NEXT: scratch_store_b32 off, v0, s2 +; GFX11-NEXT: s_add_i32 s2, s32, 0x60 ; GFX11-NEXT: v_dual_mov_b32 v8, v19 :: v_dual_mov_b32 v15, v26 -; GFX11-NEXT: scratch_store_b96 off, v[56:58], s0 -; GFX11-NEXT: s_add_i32 s0, s32, 0x50 +; GFX11-NEXT: scratch_store_b96 off, v[56:58], s2 +; GFX11-NEXT: s_add_i32 s2, s32, 0x50 ; GFX11-NEXT: v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v29, v45 -; GFX11-NEXT: scratch_store_b128 off, v[40:43], s0 -; GFX11-NEXT: s_add_i32 s0, s32, 64 +; GFX11-NEXT: scratch_store_b128 off, v[40:43], s2 +; GFX11-NEXT: s_add_i32 s2, s32, 64 ; GFX11-NEXT: v_mov_b32_e32 v14, v25 -; GFX11-NEXT: scratch_store_b128 off, v[52:55], s0 -; GFX11-NEXT: s_add_i32 s0, s32, 48 +; GFX11-NEXT: scratch_store_b128 off, v[52:55], s2 +; GFX11-NEXT: s_add_i32 s2, s32, 48 ; GFX11-NEXT: v_mov_b32_e32 v16, v27 -; GFX11-NEXT: scratch_store_b128 off, v[36:39], s0 -; GFX11-NEXT: s_add_i32 s0, s32, 32 +; GFX11-NEXT: scratch_store_b128 off, v[36:39], s2 +; GFX11-NEXT: s_add_i32 s2, s32, 32 ; GFX11-NEXT: v_mov_b32_e32 v30, v46 -; GFX11-NEXT: scratch_store_b128 off, v[48:51], s0 -; GFX11-NEXT: s_add_i32 s0, s32, 16 -; GFX11-NEXT: scratch_store_b128 off, v[32:35], s0 +; GFX11-NEXT: scratch_store_b128 off, v[48:51], s2 +; GFX11-NEXT: s_add_i32 s2, s32, 16 +; GFX11-NEXT: scratch_store_b128 off, v[32:35], s2 ; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1588 ; 16-byte Folded Reload ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, 42 @@ -3470,10 +3340,10 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1572 ; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1556 ; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1540 -; GFX11-NEXT: s_add_i32 s0, s33, 0x400 +; GFX11-NEXT: s_add_i32 s2, s33, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[44:45] +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_clause 0xb ; GFX11-NEXT: scratch_load_b32 v59, off, s33 ; GFX11-NEXT: scratch_load_b32 v58, off, s33 offset:4 @@ -3493,7 +3363,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: scratch_load_b32 v60, off, s33 offset:1536 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_addk_i32 s32, 0xf600 -; GFX11-NEXT: s_mov_b32 s33, s46 +; GFX11-NEXT: s_mov_b32 s33, s34 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: -- cgit v1.1