aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll86
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/bf16.ll73
-rw-r--r--llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/function-args.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/function-returns.ll87
-rw-r--r--llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll582
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll28
-rw-r--r--llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/skip-if-dead.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll4
15 files changed, 345 insertions, 551 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
index c25b0f2..78d9084 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -16,7 +16,6 @@ define i32 @divergent_if_swap_brtarget_order0(i32 %value) {
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: .LBB0_2: ; %endif
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%c = icmp ne i32 %value, 0
@@ -44,7 +43,6 @@ define i32 @divergent_if_swap_brtarget_order1(i32 %value) {
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: .LBB1_2: ; %endif
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%c = icmp ne i32 %value, 0
@@ -74,7 +72,6 @@ define i32 @divergent_if_nonboolean_condition0(i32 %value) {
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: .LBB2_2: ; %endif
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%c = trunc i32 %value to i1
@@ -106,7 +103,6 @@ define i32 @divergent_if_nonboolean_condition1(ptr addrspace(1) %ptr) {
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: .LBB3_2: ; %endif
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%value = load i32, ptr addrspace(1) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
index 303dc46..5c22d5b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
@@ -131,8 +131,6 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB1_2: ; %bb1
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%val = call i1 @llvm.amdgcn.is.private(ptr %ptr)
br i1 %val, label %bb0, label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
index 63702d2..e005c38 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
@@ -131,8 +131,6 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB1_2: ; %bb1
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%val = call i1 @llvm.amdgcn.is.shared(ptr %ptr)
br i1 %val, label %bb0, label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 352adac..af6f6913 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -39,9 +39,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -65,11 +65,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB0_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -92,11 +92,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB0_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -253,8 +253,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -504,11 +504,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB2_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -544,11 +544,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB2_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_add_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -944,7 +944,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
@@ -952,6 +951,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -974,7 +974,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB4_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_readfirstlane_b32 s3, v1
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1006,7 +1005,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB4_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_readfirstlane_b32 s3, v1
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1219,11 +1217,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB5_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
; GFX8-NEXT: s_mov_b32 s7, 0xf000
@@ -1258,11 +1256,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB5_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1]
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
@@ -1530,10 +1528,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -1557,12 +1555,12 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB7_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -1585,12 +1583,12 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB7_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -1751,8 +1749,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -2006,11 +2004,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB9_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -2046,11 +2044,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB9_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -2446,7 +2444,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
@@ -2454,6 +2451,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -2477,7 +2475,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB11_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2
@@ -2487,6 +2484,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -2509,7 +2507,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB11_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2
@@ -2519,6 +2516,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3081,11 +3079,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB14_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_and_b32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -3121,11 +3119,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB14_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_and_b32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3355,11 +3353,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB15_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_or_b32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -3395,11 +3393,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB15_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_or_b32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3629,11 +3627,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB16_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_xor_b32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -3669,11 +3667,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB16_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_xor_b32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3903,11 +3901,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB17_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_max_i32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -3943,11 +3941,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB17_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_max_i32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -4151,7 +4149,6 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1
@@ -4162,6 +4159,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -4182,7 +4180,6 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB18_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_bfrev_b32_e32 v0, 1
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -4216,7 +4213,6 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB18_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_bfrev_b32_e32 v0, 1
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -4419,11 +4415,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB19_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_min_i32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -4459,11 +4455,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB19_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_min_i32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -4667,7 +4663,6 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2
@@ -4678,6 +4673,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -4698,7 +4694,6 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB20_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_bfrev_b32_e32 v0, -2
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -4732,7 +4727,6 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB20_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_bfrev_b32_e32 v0, -2
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -4935,11 +4929,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB21_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_max_u32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -4975,11 +4969,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB21_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_max_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -5183,7 +5177,6 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
@@ -5193,6 +5186,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -5214,7 +5208,6 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB22_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
@@ -5226,6 +5219,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -5246,7 +5240,6 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB22_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -5258,6 +5251,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -5446,11 +5440,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB23_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_min_u32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -5486,11 +5480,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB23_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_min_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -5694,7 +5688,6 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
@@ -5704,6 +5697,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -5725,7 +5719,6 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB24_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
@@ -5737,6 +5730,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -5757,7 +5751,6 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB24_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
@@ -5769,6 +5762,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
index 19a1d2d9..c9076a9 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
@@ -186,7 +186,7 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 {
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: .LBB1_8: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 9865883..bf4302c 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -5678,22 +5678,18 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_clause 0x4
-; GFX11-NEXT: scratch_store_b128 off, v[18:21], s0 offset:64
-; GFX11-NEXT: scratch_store_b128 off, v[10:13], s0 offset:32
-; GFX11-NEXT: scratch_store_b128 off, v[6:9], s0 offset:16
-; GFX11-NEXT: scratch_store_b128 off, v[2:5], s0
-; GFX11-NEXT: scratch_store_b16 off, v1, s0 offset:128
-; GFX11-NEXT: s_add_i32 s1, s0, 0x70
-; GFX11-NEXT: s_add_i32 s2, s0, 0x60
-; GFX11-NEXT: s_add_i32 s3, s0, 0x50
-; GFX11-NEXT: s_add_i32 s0, s0, 48
+; GFX11-NEXT: s_clause 0x5
+; GFX11-NEXT: scratch_store_b128 v0, v[22:25], off offset:80
+; GFX11-NEXT: scratch_store_b128 v0, v[18:21], off offset:64
+; GFX11-NEXT: scratch_store_b128 v0, v[14:17], off offset:48
+; GFX11-NEXT: scratch_store_b128 v0, v[10:13], off offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[6:9], off offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[2:5], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b128 off, v[30:33], s1
-; GFX11-NEXT: scratch_store_b128 off, v[26:29], s2
-; GFX11-NEXT: scratch_store_b128 off, v[22:25], s3
-; GFX11-NEXT: scratch_store_b128 off, v[14:17], s0
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: scratch_store_b128 v0, v[30:33], off offset:112
+; GFX11-NEXT: scratch_store_b128 v0, v[26:29], off offset:96
+; GFX11-NEXT: scratch_store_b16 v0, v1, off offset:128
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0
%ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1
@@ -8827,19 +8823,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX11-NEXT: global_load_u16 v32, v[1:2], off offset:54
; GFX11-NEXT: global_load_u16 v33, v[1:2], off offset:58
; GFX11-NEXT: global_load_u16 v1, v[1:2], off offset:62
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_add_i32 s1, s0, 0xf0
-; GFX11-NEXT: s_add_i32 s2, s0, 0xe0
-; GFX11-NEXT: s_add_i32 s3, s0, 0xd0
-; GFX11-NEXT: s_add_i32 s4, s0, 0xc0
-; GFX11-NEXT: s_add_i32 s5, s0, 0xb0
-; GFX11-NEXT: s_add_i32 s6, s0, 0xa0
-; GFX11-NEXT: s_add_i32 s7, s0, 0x90
-; GFX11-NEXT: s_add_i32 s8, s0, 0x70
-; GFX11-NEXT: s_add_i32 s9, s0, 0x60
-; GFX11-NEXT: s_add_i32 s10, s0, 0x50
-; GFX11-NEXT: s_add_i32 s11, s0, 48
; GFX11-NEXT: s_waitcnt vmcnt(31)
; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v3
; GFX11-NEXT: s_waitcnt vmcnt(30)
@@ -8936,23 +8919,23 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX11-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
; GFX11-NEXT: v_cvt_f64_f32_e32 v[3:4], v2
; GFX11-NEXT: v_cvt_f64_f32_e32 v[1:2], v37
-; GFX11-NEXT: scratch_store_b128 off, v[96:99], s1
-; GFX11-NEXT: scratch_store_b128 off, v[84:87], s2
-; GFX11-NEXT: scratch_store_b128 off, v[80:83], s3
-; GFX11-NEXT: scratch_store_b128 off, v[68:71], s4
-; GFX11-NEXT: scratch_store_b128 off, v[64:67], s5
-; GFX11-NEXT: scratch_store_b128 off, v[52:55], s6
-; GFX11-NEXT: scratch_store_b128 off, v[48:51], s7
-; GFX11-NEXT: scratch_store_b128 off, v[33:36], s0 offset:128
-; GFX11-NEXT: scratch_store_b128 off, v[29:32], s8
-; GFX11-NEXT: scratch_store_b128 off, v[25:28], s9
-; GFX11-NEXT: scratch_store_b128 off, v[21:24], s10
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s0 offset:64
-; GFX11-NEXT: scratch_store_b128 off, v[13:16], s11
-; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s0 offset:32
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 offset:16
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_store_b128 v0, v[96:99], off offset:240
+; GFX11-NEXT: scratch_store_b128 v0, v[84:87], off offset:224
+; GFX11-NEXT: scratch_store_b128 v0, v[80:83], off offset:208
+; GFX11-NEXT: scratch_store_b128 v0, v[68:71], off offset:192
+; GFX11-NEXT: scratch_store_b128 v0, v[64:67], off offset:176
+; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:160
+; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off offset:144
+; GFX11-NEXT: scratch_store_b128 v0, v[33:36], off offset:128
+; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
+; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
+; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <32 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <32 x bfloat> %load to <32 x double>
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
index ac50fb8..da609bf 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
@@ -41,7 +41,7 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(ptr add
; GCN-NEXT: .LBB0_2: ; %endif
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, 0x3d0000
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_store_dword v1, v0, s[0:1] offset:2300
; GCN-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index 069c57e..6dabd8c 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -103,7 +103,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: .LBB0_4: ; %exit
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0]
; GFX9-NEXT: s_movk_i32 s4, 0x8000
; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
@@ -131,7 +130,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB0_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -266,7 +264,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: .LBB1_4: ; %exit
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1]
; GFX9-NEXT: s_movk_i32 s4, 0x8000
; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
@@ -294,7 +291,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB1_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -431,7 +427,6 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
; GFX9-NEXT: .LBB2_4: ; %exit
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v2
; GFX9-NEXT: v_mov_b32_e32 v5, 0x3800
; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
@@ -461,7 +456,6 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB2_4: ; %exit
; GFX11-NEXT: v_mov_b32_e32 v0, 0x3d00
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -665,7 +659,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB3_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -871,7 +864,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB4_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -1081,7 +1073,6 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB5_4: ; %exit
; GFX11-NEXT: v_mov_b32_e32 v0, 0x3d00
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -1432,7 +1423,6 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB7_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v5
; GFX11-NEXT: v_mov_b32_e32 v9, 0x3900
; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00
@@ -1724,7 +1714,6 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB8_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v5
; GFX11-NEXT: v_mov_b32_e32 v9, 0x3900
; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index db89ad6..3b2f15c 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -114,7 +114,6 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
; CIGFX89-NEXT: s_waitcnt vmcnt(0)
; CIGFX89-NEXT: .LBB3_2: ; %bb2
; CIGFX89-NEXT: s_or_b64 exec, exec, s[4:5]
-; CIGFX89-NEXT: s_waitcnt vmcnt(0)
; CIGFX89-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: i1_arg_i1_use:
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index acadee2..401cbce 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -1561,34 +1561,28 @@ define <33 x i32> @v33i32_func_void() #0 {
; GFX11-NEXT: buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80
; GFX11-NEXT: buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64
; GFX11-NEXT: buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48
-; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:16
-; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0
-; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:32
+; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32
+; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16
+; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0
; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 offset:128
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_add_i32 s1, s0, 0x70
-; GFX11-NEXT: s_add_i32 s2, s0, 0x60
-; GFX11-NEXT: s_add_i32 s3, s0, 0x50
-; GFX11-NEXT: s_add_i32 s4, s0, 48
; GFX11-NEXT: s_waitcnt vmcnt(8)
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:96
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: scratch_store_b128 off, v[13:16], s0 offset:64
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:64
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s4
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:48
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: scratch_store_b128 off, v[21:24], s0 offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:32
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: scratch_store_b128 off, v[25:28], s0
+; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:16
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b32 off, v33, s0 offset:128
+; GFX11-NEXT: scratch_store_b32 v0, v33, off offset:128
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
%val = load <33 x i32>, ptr addrspace(1) %ptr
@@ -1850,34 +1844,28 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
; GFX11-NEXT: buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80
; GFX11-NEXT: buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64
; GFX11-NEXT: buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48
-; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:16
-; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0
-; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:32
+; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32
+; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16
+; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0
; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 offset:128
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_add_i32 s1, s0, 0x70
-; GFX11-NEXT: s_add_i32 s2, s0, 0x60
-; GFX11-NEXT: s_add_i32 s3, s0, 0x50
-; GFX11-NEXT: s_add_i32 s4, s0, 48
; GFX11-NEXT: s_waitcnt vmcnt(8)
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:96
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: scratch_store_b128 off, v[13:16], s0 offset:64
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:64
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s4
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:48
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: scratch_store_b128 off, v[21:24], s0 offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:32
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: scratch_store_b128 off, v[25:28], s0
+; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:16
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b32 off, v33, s0 offset:128
+; GFX11-NEXT: scratch_store_b32 v0, v33, off offset:128
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
%val = load { <32 x i32>, i32 }, ptr addrspace(1) %ptr
@@ -2143,33 +2131,24 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:144
; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:128
; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_add_i32 s1, s0, 0xf0
-; GFX11-NEXT: s_add_i32 s2, s0, 0xe0
-; GFX11-NEXT: s_add_i32 s3, s0, 0xd0
-; GFX11-NEXT: s_add_i32 s4, s0, 0xc0
-; GFX11-NEXT: s_add_i32 s5, s0, 0xb0
-; GFX11-NEXT: s_add_i32 s6, s0, 0xa0
-; GFX11-NEXT: s_add_i32 s7, s0, 0x90
; GFX11-NEXT: s_waitcnt vmcnt(8)
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:240
; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:224
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:208
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: scratch_store_b128 off, v[13:16], s4
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:192
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s5
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:176
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: scratch_store_b128 off, v[21:24], s6
+; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:160
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: scratch_store_b128 off, v[25:28], s7
+; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:144
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:128
+; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:128
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b32 off, v33, s0
+; GFX11-NEXT: scratch_store_b32 v0, v33, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
%val = load { i32, <32 x i32> }, ptr addrspace(1) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index c1d6826..3b078c4 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -1989,256 +1989,138 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 {
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_clause 0x7
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:1024
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:512
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:256
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:128
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:64
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:32
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:16
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0
-; GFX11-NEXT: s_add_i32 s1, s0, 0x7f0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x7e0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x7d0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x7c0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x7b0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x7a0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x790
-; GFX11-NEXT: s_add_i32 s2, s0, 0x780
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x770
-; GFX11-NEXT: s_add_i32 s2, s0, 0x760
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x750
-; GFX11-NEXT: s_add_i32 s2, s0, 0x740
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x730
-; GFX11-NEXT: s_add_i32 s2, s0, 0x720
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x710
-; GFX11-NEXT: s_add_i32 s2, s0, 0x700
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x6f0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x6e0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x6d0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x6c0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x6b0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x6a0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x690
-; GFX11-NEXT: s_add_i32 s2, s0, 0x680
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x670
-; GFX11-NEXT: s_add_i32 s2, s0, 0x660
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x650
-; GFX11-NEXT: s_add_i32 s2, s0, 0x640
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x630
-; GFX11-NEXT: s_add_i32 s2, s0, 0x620
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x610
-; GFX11-NEXT: s_add_i32 s2, s0, 0x600
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x5f0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x5e0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x5d0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x5c0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x5b0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x5a0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x590
-; GFX11-NEXT: s_add_i32 s2, s0, 0x580
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x570
-; GFX11-NEXT: s_add_i32 s2, s0, 0x560
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x550
-; GFX11-NEXT: s_add_i32 s2, s0, 0x540
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x530
-; GFX11-NEXT: s_add_i32 s2, s0, 0x520
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x510
-; GFX11-NEXT: s_add_i32 s2, s0, 0x500
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x4f0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x4e0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x4d0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x4c0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x4b0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x4a0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x490
-; GFX11-NEXT: s_add_i32 s2, s0, 0x480
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x470
-; GFX11-NEXT: s_add_i32 s2, s0, 0x460
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x450
-; GFX11-NEXT: s_add_i32 s2, s0, 0x440
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x430
-; GFX11-NEXT: s_add_i32 s2, s0, 0x420
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x410
-; GFX11-NEXT: s_add_i32 s2, s0, 0x3f0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x3e0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x3d0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x3c0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x3b0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x3a0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x390
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x380
-; GFX11-NEXT: s_add_i32 s2, s0, 0x370
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x360
-; GFX11-NEXT: s_add_i32 s2, s0, 0x350
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x340
-; GFX11-NEXT: s_add_i32 s2, s0, 0x330
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x320
-; GFX11-NEXT: s_add_i32 s2, s0, 0x310
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x300
-; GFX11-NEXT: s_add_i32 s2, s0, 0x2f0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x2e0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x2d0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x2c0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x2b0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x2a0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x290
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x280
-; GFX11-NEXT: s_add_i32 s2, s0, 0x270
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x260
-; GFX11-NEXT: s_add_i32 s2, s0, 0x250
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x240
-; GFX11-NEXT: s_add_i32 s2, s0, 0x230
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x220
-; GFX11-NEXT: s_add_i32 s2, s0, 0x210
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x1f0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x1e0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x1d0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x1c0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x1b0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x1a0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x190
-; GFX11-NEXT: s_add_i32 s2, s0, 0x180
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x170
-; GFX11-NEXT: s_add_i32 s2, s0, 0x160
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x150
-; GFX11-NEXT: s_add_i32 s2, s0, 0x140
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x130
-; GFX11-NEXT: s_add_i32 s2, s0, 0x120
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x110
-; GFX11-NEXT: s_add_i32 s2, s0, 0xf0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0xe0
-; GFX11-NEXT: s_add_i32 s2, s0, 0xd0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0xc0
-; GFX11-NEXT: s_add_i32 s2, s0, 0xb0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0xa0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x90
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x70
-; GFX11-NEXT: s_add_i32 s2, s0, 0x60
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x50
-; GFX11-NEXT: s_add_i32 s0, s0, 48
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2032
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2016
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2000
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1984
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1968
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1952
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1936
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1920
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1904
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1888
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1872
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1856
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1840
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1824
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1808
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1792
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1776
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1760
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1744
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1728
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1712
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1696
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1680
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1664
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1648
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1632
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1616
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1600
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1584
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1568
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1552
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1536
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1520
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1504
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1488
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1472
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1456
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1440
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1424
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1408
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1392
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1376
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1360
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1344
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1328
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1312
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1296
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1280
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1264
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1248
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1232
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1216
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1200
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1184
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1168
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1152
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1136
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1120
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1104
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1088
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1072
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1056
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1040
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1024
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1008
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:992
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:976
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:960
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:944
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:928
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:912
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:896
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:880
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:864
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:848
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:832
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:816
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:800
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:784
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:768
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:752
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:736
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:720
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:704
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:688
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:672
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:656
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:640
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:624
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:608
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:592
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:576
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:560
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:544
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:528
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:512
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:496
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:480
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:464
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:448
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:432
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:416
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:400
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:384
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:368
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:352
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:336
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:320
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:304
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:288
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:272
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:256
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:240
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:224
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:208
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:192
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:176
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:160
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:144
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:128
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:96
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:80
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:64
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
ret <512 x i32> zeroinitializer
@@ -2636,7 +2518,6 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
; GFX11-LABEL: return_72xi32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: s_clause 0xc
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:212
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:208
@@ -2651,93 +2532,82 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:172
; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:168
; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:164
-; GFX11-NEXT: s_clause 0x14
-; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:32
-; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:28
-; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:24
-; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:48
-; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:44
-; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:40
-; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:64
-; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:60
-; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:56
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:80
-; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:76
-; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:72
-; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:96
-; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:92
-; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:88
-; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:112
-; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:108
-; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:104
-; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:128
-; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:124
-; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:120
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s0 offset:64
+; GFX11-NEXT: s_clause 0x11
+; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:12
+; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:32
+; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:28
+; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:24
+; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:48
+; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:44
+; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:40
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:64
+; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:60
+; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:56
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:80
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:76
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:72
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:96
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:92
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:88
+; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: scratch_load_b32 v23, off, s32 offset:112
+; GFX11-NEXT: scratch_load_b32 v22, off, s32 offset:108
+; GFX11-NEXT: scratch_load_b32 v21, off, s32 offset:104
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: scratch_load_b32 v20, off, s32 offset:144
-; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:140
-; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:136
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s0 offset:32
+; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:128
+; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:124
+; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:120
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: scratch_load_b32 v12, off, s32 offset:160
-; GFX11-NEXT: scratch_load_b32 v11, off, s32 offset:156
-; GFX11-NEXT: scratch_load_b32 v10, off, s32 offset:152
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 offset:16
+; GFX11-NEXT: scratch_load_b32 v15, off, s32 offset:144
+; GFX11-NEXT: scratch_load_b32 v14, off, s32 offset:140
+; GFX11-NEXT: scratch_load_b32 v13, off, s32 offset:136
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
; GFX11-NEXT: s_clause 0xd
-; GFX11-NEXT: scratch_load_b32 v8, off, s32 offset:16
-; GFX11-NEXT: scratch_load_b32 v7, off, s32 offset:12
-; GFX11-NEXT: scratch_load_b32 v6, off, s32 offset:8
-; GFX11-NEXT: scratch_load_b32 v5, off, s32 offset:4
-; GFX11-NEXT: scratch_load_b32 v9, off, s32 offset:148
-; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:132
-; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:116
-; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:100
-; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:84
-; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:68
-; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:52
-; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:36
-; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:20
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:156
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:152
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:148
+; GFX11-NEXT: scratch_load_b32 v12, off, s32 offset:132
+; GFX11-NEXT: scratch_load_b32 v16, off, s32 offset:116
+; GFX11-NEXT: scratch_load_b32 v20, off, s32 offset:100
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:84
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:68
+; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:36
+; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:20
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v32, off, s32
-; GFX11-NEXT: s_add_i32 s1, s0, 0x110
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x100
-; GFX11-NEXT: s_add_i32 s3, s0, 0xf0
-; GFX11-NEXT: s_add_i32 s34, s0, 0xe0
-; GFX11-NEXT: s_add_i32 s35, s0, 0xd0
-; GFX11-NEXT: s_add_i32 s36, s0, 0xc0
-; GFX11-NEXT: s_add_i32 s37, s0, 0xb0
-; GFX11-NEXT: s_add_i32 s38, s0, 0xa0
-; GFX11-NEXT: s_add_i32 s39, s0, 0x90
-; GFX11-NEXT: s_add_i32 s40, s0, 0x70
-; GFX11-NEXT: s_add_i32 s41, s0, 0x60
-; GFX11-NEXT: s_add_i32 s42, s0, 0x50
-; GFX11-NEXT: s_add_i32 s43, s0, 48
; GFX11-NEXT: s_waitcnt vmcnt(10)
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 offset:128
+; GFX11-NEXT: scratch_store_b128 v0, v[60:63], off offset:272
; GFX11-NEXT: s_waitcnt vmcnt(9)
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s1
+; GFX11-NEXT: scratch_store_b128 v0, v[12:15], off offset:256
; GFX11-NEXT: s_waitcnt vmcnt(8)
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s2
+; GFX11-NEXT: scratch_store_b128 v0, v[16:19], off offset:240
; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: scratch_store_b128 off, v[60:63], s3
+; GFX11-NEXT: scratch_store_b128 v0, v[20:23], off offset:224
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: scratch_store_b128 off, v[56:59], s34
+; GFX11-NEXT: scratch_store_b128 v0, v[56:59], off offset:208
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: scratch_store_b128 off, v[41:44], s35
+; GFX11-NEXT: scratch_store_b128 v0, v[41:44], off offset:192
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: scratch_store_b128 off, v[37:40], s36
+; GFX11-NEXT: scratch_store_b128 v0, v[37:40], off offset:176
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: scratch_store_b128 off, v[52:55], s37
+; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:160
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: scratch_store_b128 off, v[48:51], s38
+; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off offset:144
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: scratch_store_b128 off, v[33:36], s39
+; GFX11-NEXT: scratch_store_b128 v0, v[33:36], off offset:128
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b128 off, v[29:32], s40
-; GFX11-NEXT: scratch_store_b128 off, v[25:28], s41
-; GFX11-NEXT: scratch_store_b128 off, v[21:24], s42
-; GFX11-NEXT: scratch_store_b128 off, v[13:16], s43
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
+; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-NEXT: s_clause 0xc
; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:164
; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:168
@@ -3306,7 +3176,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-LABEL: call_72xi32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s46, s33
+; GFX11-NEXT: s_mov_b32 s34, s33
; GFX11-NEXT: s_add_i32 s33, s32, 0x1ff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s33, s33, 0xfffffe00
@@ -3353,11 +3223,11 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1
; GFX11-NEXT: s_add_i32 s0, s32, 32
; GFX11-NEXT: s_add_i32 s1, s32, 16
+; GFX11-NEXT: s_add_i32 s2, s33, 0x200
+; GFX11-NEXT: v_writelane_b32 v60, s30, 0
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1
-; GFX11-NEXT: s_add_i32 s0, s33, 0x200
-; GFX11-NEXT: v_writelane_b32 v60, s30, 0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, 0
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 0
; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, 0
@@ -3373,14 +3243,14 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: v_dual_mov_b32 v27, 0 :: v_dual_mov_b32 v26, 0
; GFX11-NEXT: v_dual_mov_b32 v29, 0 :: v_dual_mov_b32 v28, 0
; GFX11-NEXT: v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0
-; GFX11-NEXT: s_mov_b32 s45, return_72xi32@abs32@hi
-; GFX11-NEXT: s_mov_b32 s44, return_72xi32@abs32@lo
+; GFX11-NEXT: s_mov_b32 s1, return_72xi32@abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, return_72xi32@abs32@lo
; GFX11-NEXT: v_writelane_b32 v60, s31, 1
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[44:45]
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_load_b128 v[45:48], off, s33 offset:624
; GFX11-NEXT: scratch_load_b128 v[33:36], off, s33 offset:640
-; GFX11-NEXT: s_add_i32 s0, s32, 0xa0
+; GFX11-NEXT: s_add_i32 s2, s32, 0xa0
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_mov_b32_e32 v32, v48
; GFX11-NEXT: s_clause 0x9
@@ -3431,38 +3301,38 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: v_dual_mov_b32 v2, v5 :: v_dual_mov_b32 v3, v6
; GFX11-NEXT: v_dual_mov_b32 v5, v8 :: v_dual_mov_b32 v6, v9
; GFX11-NEXT: v_mov_b32_e32 v9, v20
-; GFX11-NEXT: scratch_store_b32 off, v11, s0
-; GFX11-NEXT: s_add_i32 s0, s32, 0x90
+; GFX11-NEXT: scratch_store_b32 off, v11, s2
+; GFX11-NEXT: s_add_i32 s2, s32, 0x90
; GFX11-NEXT: v_mov_b32_e32 v11, v22
-; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0
-; GFX11-NEXT: s_add_i32 s0, s32, 0x80
+; GFX11-NEXT: scratch_store_b128 off, v[4:7], s2
+; GFX11-NEXT: s_add_i32 s2, s32, 0x80
; GFX11-NEXT: v_mov_b32_e32 v5, v16
-; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0
+; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2
; GFX11-NEXT: v_mov_b32_e32 v0, 24
-; GFX11-NEXT: s_add_i32 s0, s32, 0x70
+; GFX11-NEXT: s_add_i32 s2, s32, 0x70
; GFX11-NEXT: v_mov_b32_e32 v6, v17
-; GFX11-NEXT: scratch_store_b128 off, v[12:15], s0
+; GFX11-NEXT: scratch_store_b128 off, v[12:15], s2
; GFX11-NEXT: v_mov_b32_e32 v13, v24
-; GFX11-NEXT: s_add_i32 s0, s32, 0x6c
+; GFX11-NEXT: s_add_i32 s2, s32, 0x6c
; GFX11-NEXT: v_mov_b32_e32 v7, v18
-; GFX11-NEXT: scratch_store_b32 off, v0, s0
-; GFX11-NEXT: s_add_i32 s0, s32, 0x60
+; GFX11-NEXT: scratch_store_b32 off, v0, s2
+; GFX11-NEXT: s_add_i32 s2, s32, 0x60
; GFX11-NEXT: v_dual_mov_b32 v8, v19 :: v_dual_mov_b32 v15, v26
-; GFX11-NEXT: scratch_store_b96 off, v[56:58], s0
-; GFX11-NEXT: s_add_i32 s0, s32, 0x50
+; GFX11-NEXT: scratch_store_b96 off, v[56:58], s2
+; GFX11-NEXT: s_add_i32 s2, s32, 0x50
; GFX11-NEXT: v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v29, v45
-; GFX11-NEXT: scratch_store_b128 off, v[40:43], s0
-; GFX11-NEXT: s_add_i32 s0, s32, 64
+; GFX11-NEXT: scratch_store_b128 off, v[40:43], s2
+; GFX11-NEXT: s_add_i32 s2, s32, 64
; GFX11-NEXT: v_mov_b32_e32 v14, v25
-; GFX11-NEXT: scratch_store_b128 off, v[52:55], s0
-; GFX11-NEXT: s_add_i32 s0, s32, 48
+; GFX11-NEXT: scratch_store_b128 off, v[52:55], s2
+; GFX11-NEXT: s_add_i32 s2, s32, 48
; GFX11-NEXT: v_mov_b32_e32 v16, v27
-; GFX11-NEXT: scratch_store_b128 off, v[36:39], s0
-; GFX11-NEXT: s_add_i32 s0, s32, 32
+; GFX11-NEXT: scratch_store_b128 off, v[36:39], s2
+; GFX11-NEXT: s_add_i32 s2, s32, 32
; GFX11-NEXT: v_mov_b32_e32 v30, v46
-; GFX11-NEXT: scratch_store_b128 off, v[48:51], s0
-; GFX11-NEXT: s_add_i32 s0, s32, 16
-; GFX11-NEXT: scratch_store_b128 off, v[32:35], s0
+; GFX11-NEXT: scratch_store_b128 off, v[48:51], s2
+; GFX11-NEXT: s_add_i32 s2, s32, 16
+; GFX11-NEXT: scratch_store_b128 off, v[32:35], s2
; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1588 ; 16-byte Folded Reload
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, 42
@@ -3470,10 +3340,10 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1572
; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1556
; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1540
-; GFX11-NEXT: s_add_i32 s0, s33, 0x400
+; GFX11-NEXT: s_add_i32 s2, s33, 0x400
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[44:45]
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_clause 0xb
; GFX11-NEXT: scratch_load_b32 v59, off, s33
; GFX11-NEXT: scratch_load_b32 v58, off, s33 offset:4
@@ -3493,7 +3363,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: scratch_load_b32 v60, off, s33 offset:1536 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_addk_i32 s32, 0xf600
-; GFX11-NEXT: s_mov_b32 s33, s46
+; GFX11-NEXT: s_mov_b32 s33, s34
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
index 433a836..3b3e107 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
@@ -33,7 +33,7 @@ define void @func_use_lds_global() {
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX8-SDAG-NEXT: s_mov_b32 m0, -1
-; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -103,7 +103,7 @@ define void @func_use_lds_global_constexpr_cast() {
; GFX8-SDAG-LABEL: func_use_lds_global_constexpr_cast:
; GFX8-SDAG: ; %bb.0:
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-SDAG-NEXT: s_trap 2
@@ -171,7 +171,7 @@ define void @func_uses_lds_multi(i1 %cond) {
; GFX8-SDAG-NEXT: s_cbranch_execz .LBB2_2
; GFX8-SDAG-NEXT: ; %bb.1: ; %bb1
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 1
-; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -181,7 +181,7 @@ define void @func_uses_lds_multi(i1 %cond) {
; GFX8-SDAG-NEXT: s_cbranch_execz .LBB2_4
; GFX8-SDAG-NEXT: ; %bb.3: ; %bb0
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -189,7 +189,7 @@ define void @func_uses_lds_multi(i1 %cond) {
; GFX8-SDAG-NEXT: .LBB2_4: ; %ret
; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -379,7 +379,7 @@ define void @func_uses_lds_code_after(ptr addrspace(1) %ptr) {
; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX8-SDAG-NEXT: s_mov_b32 m0, -1
; GFX8-SDAG-NEXT: ds_write_b32 v0, v2
-; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 1
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -472,7 +472,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX8-SDAG-NEXT: ; %bb.1: ; %use.bb
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX8-SDAG-NEXT: s_mov_b32 m0, -1
-; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -481,7 +481,6 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX8-SDAG-NEXT: .LBB4_2: ; %ret
; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: func_uses_lds_phi_after:
@@ -506,7 +505,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX8-GISEL-NEXT: .LBB4_2: ; %ret
; GFX8-GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: func_uses_lds_phi_after:
@@ -527,7 +526,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: .LBB4_2: ; %ret
; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: func_uses_lds_phi_after:
@@ -548,7 +547,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: .LBB4_2: ; %ret
; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-LABEL: func_uses_lds_phi_after:
@@ -570,7 +569,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: .LBB4_3: ; %ret
; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
-; SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
; SDAG-NEXT: .LBB4_4:
; SDAG-NEXT: s_endpgm
@@ -594,7 +593,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: .LBB4_3: ; %ret
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
; GISEL-NEXT: .LBB4_4:
; GISEL-NEXT: s_endpgm
@@ -616,6 +615,3 @@ ret:
; CHECK: {{.*}}
; GFX8: {{.*}}
; GFX9: {{.*}}
-
-!llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
index 5e76dfd..4477f02 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
@@ -157,7 +157,6 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; VI-NEXT: .LBB2_2:
; VI-NEXT: s_or_b64 exec, exec, s[6:7]
; VI-NEXT: s_mov_b64 s[6:7], exec
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_readfirstlane_b32 s8, v1
; VI-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0
; VI-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1
@@ -203,15 +202,14 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; VI-NEXT: ; %bb.7:
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: s_mov_b32 m0, -1
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_add_rtn_f32 v2, v2, v1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: .LBB2_8:
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_readfirstlane_b32 s2, v2
; VI-NEXT: v_add_f32_e32 v2, s2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -240,7 +238,6 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; GFX9-NEXT: .LBB2_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s8, v1
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1
@@ -285,16 +282,15 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; GFX9-NEXT: s_cbranch_execz .LBB2_8
; GFX9-NEXT: ; %bb.7:
; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_f32 v2, v2, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB2_8:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v2
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_add_f32_e32 v0, s2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index 138dd53..d19ef75 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1260,8 +1260,6 @@ define amdgpu_ps void @phi_use_def_before_kill(float inreg %x) #0 {
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB11_5: ; %end
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
; GFX11-NEXT: .LBB11_6:
; GFX11-NEXT: s_mov_b64 exec, 0
@@ -1525,8 +1523,6 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2,
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB13_5: ; %UnifiedReturnBlock
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
; GFX11-NEXT: .LBB13_6:
; GFX11-NEXT: s_mov_b64 exec, 0
diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
index eef5f57..ecebbb9 100644
--- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
+++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
@@ -32,7 +32,7 @@ define amdgpu_ps float @test_return_to_epilog_into_end_block(i32 inreg %a, float
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GCN-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GCN-NEXT: S_WAITCNT_soft 3952
+ ; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.3:
entry:
@@ -79,7 +79,7 @@ define amdgpu_ps float @test_unify_return_to_epilog_into_end_block(i32 inreg %a,
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GCN-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GCN-NEXT: S_WAITCNT_soft 3952
+ ; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.5:
entry: